OpenDAS / opencompass

Commit e78857ac (unverified)
[Sync] minor test (#683)
Authored Dec 11, 2023 by Hubert; committed via GitHub on Dec 11, 2023
Parent: dd4318f6

The commit touches 57 files in total; this page shows 20 changed files with 489 additions and 108 deletions (+489 -108).
Changed files on this page:

configs/models/hf_internlm/hf_internlm_chat_20b.py    +1   -0
configs/models/hf_internlm/hf_internlm_chat_7b.py     +1   -0
configs/models/hf_internlm/hf_internlm_chat_7b_8k.py  +1   -0
configs/models/qwen/hf_qwen_14b_chat.py               +3   -1
configs/models/qwen/hf_qwen_7b_chat.py                +3   -1
configs/summarizers/groups/cibench.py                 +4   -0
configs/summarizers/groups/mathbench.py               +75  -0
configs/summarizers/math_agent.py                     +28  -0
opencompass/datasets/cibench.py                       +154 -67
opencompass/datasets/cmnli.py                         +2   -0
opencompass/datasets/ds1000.py                        +11  -0
opencompass/datasets/gsm8k.py                         +1   -1
opencompass/datasets/wikibench.py                     +1   -1
opencompass/datasets/winogrande.py                    +14  -16
opencompass/lagent/actions/ipython_interpreter.py     +26  -7
opencompass/models/base.py                            +14  -0
opencompass/models/base_api.py                        +17  -2
opencompass/models/huggingface.py                     +125 -7
opencompass/models/lagent.py                          +3   -2
opencompass/models/llama2.py                          +5   -3
configs/models/hf_internlm/hf_internlm_chat_20b.py  (view file @ e78857ac)

@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=2, num_procs=1),
+        end_str='<eoa>',
     )
 ]
configs/models/hf_internlm/hf_internlm_chat_7b.py  (view file @ e78857ac)

@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
configs/models/hf_internlm/hf_internlm_chat_7b_8k.py  (view file @ e78857ac)

@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
configs/models/qwen/hf_qwen_14b_chat.py  (view file @ e78857ac)

@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
         pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]
configs/models/qwen/hf_qwen_7b_chat.py  (view file @ e78857ac)

@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
         pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]
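Note on the config changes above: `end_str` marks the string at which the HuggingFace wrapper truncates decoded output (the corresponding wrapper code appears further down in opencompass/models/huggingface.py, where decoded tokens are split on `self.end_str`). A minimal standalone sketch of that truncation rule, with an invented completion string for illustration:

    # Sketch only: cut a decoded completion at the configured end string.
    def truncate_at_end_str(decoded: str, end_str: str = '<|im_end|>') -> str:
        return decoded.split(end_str)[0]

    print(truncate_at_end_str('The answer is 42.<|im_end|>assistant ...'))
    # -> 'The answer is 42.'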
configs/summarizers/groups/cibench.py  (new file, mode 100644, view file @ e78857ac)

_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
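A summary group such as `cibench_summary_groups` is consumed by a summarizer config, which reports the listed subsets as one aggregated score. A hedged sketch of how this group could be wired into a summarizer; the `read_base` import style matches other OpenCompass configs, but the exact paths here are illustrative rather than taken from this commit:

    # Sketch of a summarizer config that aggregates the cibench subsets.
    from mmengine.config import read_base

    with read_base():
        from .groups.cibench import cibench_summary_groups

    summarizer = dict(
        dataset_abbrs=['cibench'],
        summary_groups=cibench_summary_groups,
    )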
configs/summarizers/groups/mathbench.py  (new file, mode 100644, view file @ e78857ac)

mathbench_summary_groups = [
    {
        'name': 'mathbench-college',
        'subsets': [
            ['mathbench-college-single_choice_cn', 'acc_1'],
            ['mathbench-college-cloze_en', 'accuracy'],
        ]
    },
    {
        'name': 'mathbench-high',
        'subsets': [
            ['mathbench-high-single_choice_cn', 'acc_1'],
            ['mathbench-high-single_choice_en', 'acc_1'],
        ]
    },
    {
        'name': 'mathbench-middle',
        'subsets': [
            ['mathbench-middle-single_choice_cn', 'acc_1'],
        ]
    },
    {
        'name': 'mathbench-primary',
        'subsets': [
            ['mathbench-primary-cloze_cn', 'accuracy'],
        ]
    },
    {
        'name': 'mathbench',
        'subsets': [
            'mathbench-college',
            'mathbench-high',
            'mathbench-middle',
            'mathbench-primary',
        ],
    },
    {
        'name': 'mathbench-college-circular',
        'subsets': [
            ['mathbench-college-single_choice_cn', 'perf_4'],
        ]
    },
    {
        'name': 'mathbench-high-circular',
        'subsets': [
            ['mathbench-high-single_choice_cn', 'perf_4'],
            ['mathbench-high-single_choice_en', 'perf_4'],
        ]
    },
    {
        'name': 'mathbench-middle-circular',
        'subsets': [
            ['mathbench-middle-single_choice_cn', 'perf_4'],
        ]
    },
    {
        'name': 'mathbench-circular',
        'subsets': [
            'mathbench-college-circular',
            'mathbench-high-circular',
            'mathbench-middle-circular',
        ],
    },
    {
        'name': 'mathbench-circular-and-cloze',
        'subsets': [
            'mathbench-high-circular',
            'mathbench-middle-circular',
            'mathbench-circular',
            'mathbench-college-cloze_en',
            'mathbench-primary-cloze_cn',
        ],
    }
]
configs/summarizers/math_agent.py  (new file, mode 100644, view file @ e78857ac)

summarizer = dict(
    dataset_abbrs=[
        '######## GSM8K-Agent Accuracy ########',  # category
        ['gsm8k-agent', 'follow_acc'],
        ['gsm8k-agent', 'reasoning_acc'],
        ['gsm8k-agent', 'code_acc'],
        ['gsm8k-agent', 'action_pct'],
        '######## MATH-Agent Accuracy ########',  # category
        ['math-agent', 'follow_acc'],
        ['math-agent', 'reasoning_acc'],
        ['math-agent', 'code_acc'],
        ['math-agent', 'action_pct'],
        '######## MathBench-Agent Accuracy ########',  # category
        ['mathbench-college-single_choice_cn-agent', 'acc_1'],
        ['mathbench-college-cloze_en-agent', 'accuracy'],
        ['mathbench-high-single_choice_cn-agent', 'acc_1'],
        ['mathbench-high-single_choice_en-agent', 'acc_1'],
        ['mathbench-middle-single_choice_cn-agent', 'acc_1'],
        ['mathbench-primary-cloze_cn-agent', 'accuracy'],
        '######## MathBench-Agent CircularEval ########',  # category
        ['mathbench-college-single_choice_cn-agent', 'perf_4'],
        ['mathbench-high-single_choice_cn-agent', 'perf_4'],
        ['mathbench-high-single_choice_en-agent', 'perf_4'],
        ['mathbench-middle-single_choice_cn-agent', 'perf_4'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
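The `summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], [])` line gathers every `*_summary_groups` list that earlier imports placed into the config namespace (for example the mathbench and cibench groups added in this commit) and flattens them into one list. A small self-contained illustration of that idiom, with made-up group definitions:

    # In the real config these lists arrive via read_base() imports.
    mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['a', 'b']}]
    cibench_summary_groups = [{'name': 'cibench', 'subsets': ['c']}]

    summary_groups = sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
    print([g['name'] for g in summary_groups])  # ['mathbench', 'cibench']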
opencompass/datasets/cibench.py  (view file @ e78857ac)

@@ -2,13 +2,15 @@ import json
 import os
 import os.path as osp
 import re
+import subprocess
+from collections import defaultdict
 from typing import List, Optional
 
 import numpy as np
 from datasets import Dataset
 
 from opencompass.openicl.icl_evaluator import BaseEvaluator
-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
 
 from .base import BaseDataset

@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
     with open(file, 'r') as f:
         notebook = json.load(f)
         example = notebook['cells']
+        metadata = notebook['metadata']
+        modules = metadata.get('modules', [])
+        if modules:
+            # these two annotations should be the same
+            assert len(modules) == len(metadata.get('step_types'))
+            # reformat annotations
+            modules = [[_m.strip() for _m in _modules.split('&')]
+                       for _modules in modules]
         questions = []
+        source_codes = []
         outputs = []
         tags = []
        for cell in example:
             if cell['cell_type'] == 'markdown':
-                text = ''.join(cell['source'])
+                text = ''.join(cell['source']).strip()
+                if modules:
+                    _modules = modules.pop(0)
+                    text += f"Please use {' and '.join(_modules)} modules."
+                text = text.strip() + '\n'
                 # append the formatted text
                 questions.append(text)
             elif cell['cell_type'] == 'code':
+                source_codes.append(''.join(cell['source']))
                 if cell['outputs'] and 'data' in cell['outputs'][-1]:
                     if 'image/png' in cell['outputs'][-1]['data']:
                         # skip vis temporarily due to lack of evaluation

@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
                         outputs.append(''.join(
                             cell['outputs'][-1]['data']['text/plain']))
                 else:
-                    tags.append('executable')
+                    tags.append('exec')
                     outputs.append(None)
     return dict(
         experiment=file,
         questions=sum(([
             dict(role='user', content=question),
-            dict(role='assistant', content=output)
-        ] for question, output in zip(questions, outputs)), []),
-        references=dict(outputs=outputs, tags=tags, experiment=file),
+            dict(role='assistant', content=source_code)
+        ] for question, source_code in zip(questions, source_codes)), []),
+        references=dict(outputs=outputs, tags=tags, metadata=metadata,
+                        experiment=file),
     )

@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
     @staticmethod
     def load(path: str):
         """Load whole dataset."""
+        assert os.path.exists(path), f'Path {path} does not exist.'
         data_list = []
         for cwd, dirs, files in os.walk(path):
             dirs.sort()

@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
     """Evaluator for CI dataset.
 
     Args:
+        text_evaluator (optional, dict): The text evaluator for text result
+            comparison[]. Defaults to None, which use Rouge as defaults.
+            Please notice that a extra key for `metric_name` should be set
+            to get the exact metric result, such as `rouge1`.
         output_dir (optional, str): The directory to save experiment
             files in a markdown or notebook format.
+        with_ipynb (bool): Generate ipynb correspondingly.
+            Defaults to False.
         user_data_dir (str): The directory to load local files.
             Defaults to 'ENV', which means use environment variable
             `USER_DATA_DIR` to get the data dir.
     """
 
     def __init__(self,
+                 text_evaluator: Optional[dict] = None,
                  output_dir: Optional[str] = None,
+                 with_ipynb: bool = False,
                  user_data_dir: str = 'ENV') -> None:
+        if text_evaluator is None:
+            from opencompass.openicl.icl_evaluator import RougeEvaluator
+            self.text_evaluator = ICL_EVALUATORS.build(
+                dict(type=RougeEvaluator))
+            self.text_eval_metric = 'rouge1'
+        else:
+            self.text_eval_metric = text_evaluator.pop('metric_name')
+            self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
         # TODO: should use work dir for this task.
         self.output_dir = output_dir
+        self.user_data_dir = self.check_user_data_dir(user_data_dir)
+        self.with_ipynb = with_ipynb
+        self.TAG_MAPPING = {
+            'exec': ('executable', self.valid_step),
+            'general': ('general_correct', self.correct_step),
+            'num': ('numeric_correct', self.correct_step),
+            'text': ('text_score', self.text_step),
+            'vis': ('vis_sim', self.vis_similarity_step),
+        }
+
+    def check_user_data_dir(self, user_data_dir):
         if user_data_dir == 'ENV':
             user_data_dir = os.environ.get('USER_DATA_DIR', '')
-        self.user_data_dir = user_data_dir
+
+        user_data_dir = user_data_dir.rstrip('/')
+        basename = osp.basename(user_data_dir)
+        if basename and basename != 'data':
+            user_data_dir = osp.join(user_data_dir, 'data')
+            assert osp.exists(user_data_dir), \
+                f'a subfolder named `data` should exist under {user_data_dir}.'
+        elif basename:
+            assert osp.exists(user_data_dir), \
+                f'{user_data_dir} does not exist.'
+        return user_data_dir
 
     @staticmethod
     def valid_step(step):

@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
         # Fall back to False
         return False
 
+    def text_step(self, step, target):
+        """Whether the step output is correct."""
+        # Found the latest code interpreter to determine correct
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['result']:
+                    try:
+                        pred = action['result']['text']
+                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
+                        if match:
+                            out = match.group(1)
+                            score = self.text_evaluator.score([out], [target])
+                            return score[self.text_eval_metric] / 100
+                    except Exception:
+                        return False
+        # Fall back to False
+        return False
+
     @staticmethod
     def vis_similarity_step(step, target):
         """Whether the step output image has the same structure similarity with

@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
                 'the conversion processes.')
 
         check_jupytext()
+        p_list = []
         from opencompass.lagent.actions.ipython_interpreter import extract_code
         for idx, (example_origin_prompt,
                   example_steps) in enumerate(zip(origin_prompt, steps)):

@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
                 f.writelines(markdown_lines)
 
             # TODO: be careful for this
+            # The result might be different with infer process
+            # please check carefully
             # convert markdown to ipynb and exectue with error tolerance
-            # subprocess.Popen(
-            #     "jupytext --to ipynb --pipe-fmt ipynb "
-            #     "--pipe 'jupyter nbconvert --to ipynb --execute "
-            #     f"--allow-errors --stdin --stdout' {md_file}",
-            #     shell=True)
+            if self.with_ipynb:
+                p = subprocess.Popen(
+                    'jupytext --to ipynb --pipe-fmt ipynb '
+                    "--pipe 'jupyter nbconvert --to ipynb --execute "
+                    f"--allow-errors --stdin --stdout' {md_file}",
+                    shell=True)
+                p_list.append(p)
+        # TODO: async wait
+        for p in p_list:
+            p.wait()
 
     def set_data_dir(self, work_dir):
         """Set work directory and link data files for save notebook results."""
         if self.user_data_dir:
-            if self.user_data_dir.endswith('/'):
-                basename = osp.basename(osp.split(self.user_data_dir)[0])
-            else:
-                basename = osp.basename(self.user_data_dir)
+            basename = osp.basename(self.user_data_dir)
             if not osp.exists(osp.join(self.output_dir, basename)):
                 os.symlink(self.user_data_dir,
                            osp.join(self.output_dir, basename))

@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
         """Change work directory and keep the symlink."""
         os.chdir(work_dir)
 
+    def single_exp(self, gold, steps):
+        tags = gold['tags']
+        outputs = gold['outputs']
+        metadata = gold['metadata']
+        hard_tags = metadata.get('step_types', [])
+        if hard_tags:
+            tags = hard_tags
+
+        # executable: exec succeed
+        # general_correct: general correct
+        # numeric_correct: numerical correct
+        # text_score: text score
+        # vis_sim: visual similarity
+        result = defaultdict(list)
+        for tag, step, output in zip(tags, steps, outputs):
+            # check whether this step is valid
+            result['executable'].append(self.valid_step(step))
+            if tag != 'exec':
+                key, func = self.TAG_MAPPING[tag]
+                result[key].append(func(step, output))
+
+        # add missing metric for better analyse if not exists
+        if hard_tags:
+            check_tags = ['exec', 'num', 'text', 'vis']
+        else:
+            check_tags = ['exec', 'general', 'vis']
+        for tag in check_tags:
+            key = self.TAG_MAPPING[tag][0]
+            if key not in result:
+                result[key] = []
+
+        return result
+
+    def get_output_dir(self):
+        """Get output dir from eval task.
+
+        Notice: output dir should be in format xxx/data.
+        All the needed files should be
+        """
+        # hard hack for get output dir from eval task
+        if hasattr(self, '_out_dir') and self.output_dir is None:
+            self.output_dir = self._out_dir
+
     def score(self, predictions: List, references: List, steps: List,
               origin_prompt: List):
         """Calculate accuracy."""
         cwd = os.getcwd()
+        self.get_output_dir()
         if self.output_dir:
             if not osp.exists(self.output_dir):
                 os.makedirs(self.output_dir)

@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
             self.save_results(origin_prompt, steps)
             self.unset_data_dir(cwd)
 
-        num_cells_list = []
-        num_general_list = []
-        passed_list = []
-        correct_list = []
-        vis_list = []
+        total_results = defaultdict(float)
+        total_scores = defaultdict(float)
+        total_nums = defaultdict(int)
         for gold, single_steps in zip(references, steps):
-            tags = gold['tags']
-            outputs = gold['outputs']
-            num_cells = len(tags)
-            num_general = sum([tag == 'general' for tag in tags])
-
-            passed = sum([self.valid_step(step) for step in single_steps])
-            correct = 0
-            vis_sim = []
-            for tag, step, output in zip(tags, single_steps, outputs):
-                if tag == 'general':
-                    correct += self.correct_step(step, output)
-                elif tag == 'vis':
-                    vis_sim.append(self.vis_similarity_step(step, output))
-
-            num_cells_list.append(num_cells)
-            num_general_list.append(num_general)
-            passed_list.append(passed)
-            correct_list.append(correct)
-            if vis_sim:
-                vis_list.append(sum(vis_sim) / len(vis_sim))
-            else:
-                vis_list.append(-1)
-
-        if len([v for v in vis_list if v >= 0]) > 0:
-            visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
-                [v for v in vis_list if v >= 0])
-        else:
-            # not valid
-            visualize_similarity = -1
-
-        if sum(num_general_list) > 0:
-            general_accuracy = sum(correct_list) / sum(num_general_list)
-        else:
-            # not valid
-            general_accuracy = -1
-
-        result = dict(
-            executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
-            general_accuracy=general_accuracy * 100,
-            visualize_similarity=visualize_similarity * 100,
-            num_cells_list=num_cells_list,
-            num_general_list=num_general_list,
-            passed_list=passed_list,
-            correct_list=correct_list,
-            vis_list=vis_list,
-        )
-        return result
+            result = self.single_exp(gold, single_steps)
+
+            for k, v in result.items():
+                total_scores[k] += sum(v)
+                total_nums[k] += len(v)
+
+        for k, v in total_scores.items():
+            if total_nums[k] > 0:
+                total_results[k] = total_scores[k] / total_nums[k] * 100
+            else:
+                total_results[k] = -1
+
+        return total_results
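The reworked `score` method above folds per-step results into `total_scores`/`total_nums` dictionaries keyed by the metric names from `TAG_MAPPING`, then converts each metric into a percentage (or -1 when no steps carried that tag). A toy sketch of the aggregation, using invented step results:

    from collections import defaultdict

    # Invented per-experiment results in the shape single_exp() returns.
    experiment_results = [
        {'executable': [1, 1, 0], 'numeric_correct': [1]},
        {'executable': [1, 1, 1], 'numeric_correct': [0, 1]},
    ]

    total_scores = defaultdict(float)
    total_nums = defaultdict(int)
    for result in experiment_results:
        for k, v in result.items():
            total_scores[k] += sum(v)
            total_nums[k] += len(v)

    total_results = {
        k: total_scores[k] / total_nums[k] * 100 if total_nums[k] else -1
        for k in total_scores
    }
    print(total_results)  # executable: ~83.3, numeric_correct: ~66.7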
opencompass/datasets/cmnli.py  (view file @ e78857ac)

@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
+                if line['label'] == '-':
+                    continue
                 data.append(line)
         return Dataset.from_list(data)
opencompass/datasets/ds1000.py  (view file @ e78857ac)

@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
     return text
 
 
+@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
+def ds1000_completion_postprocess(text: str) -> str:
+    text += '</code>'
+
+    match = re.search('(.*?)</code>', text, re.DOTALL)
+    if match:
+        text = match.group(1)
+
+    return text
+
+
 @TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
 def ds1000_matplotlib_postprocess(text: str) -> str:
     text = ds1000_postprocess(text)
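The new `ds1000_completion` postprocessor appends a closing `</code>` tag and keeps only the text before the first occurrence, so a completion that already contains the tag is cut there and one without it passes through unchanged. A quick check of that behaviour, with invented inputs:

    import re

    def ds1000_completion_postprocess(text: str) -> str:
        # Mirrors the function added above.
        text += '</code>'
        match = re.search('(.*?)</code>', text, re.DOTALL)
        if match:
            text = match.group(1)
        return text

    print(ds1000_completion_postprocess('result = df.mean()</code> extra'))
    # -> 'result = df.mean()'
    print(ds1000_completion_postprocess('result = df.mean()'))
    # -> 'result = df.mean()'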
opencompass/datasets/gsm8k.py  (view file @ e78857ac)

@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
             reasoning_acc=100 *
             (reasoning_scope + final_scope + row_reasoning_scope) / total,
             code_acc=100 * (code_scope + final_scope) / total,
-            action_acc=100 * (action_scope + final_scope) / total,
+            action_pct=100 * (action_scope + final_scope) / total,
         )
         return result
opencompass/datasets/wikibench.py  (view file @ e78857ac)

@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
         circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
 
         data = []
-        with open(path, 'r') as infile:
+        with open(path, 'r', encoding='utf-8') as infile:
             for id, line in enumerate(infile):
                 entry = json.loads(line)
                 if 'cloze' in name:
opencompass/datasets/winogrande.py  (view file @ e78857ac)

@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
             for line in f:
                 line = json.loads(line)
                 prompt = line['sentence']
-                dataset_list.append({
-                    'opt1':
-                    prompt.replace('_', line['option1']),
-                    'opt2':
-                    prompt.replace('_', line['option2']),
-                    'answer': line['answer']
-                })
+                continue_prompt = prompt.split('_')
+                data_item = {
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': line['answer'],
+                    'cont': continue_prompt[1]
+                }
+                dataset_list.append(data_item)
         dataset_list = Dataset.from_list(dataset_list)
         return dataset_list

@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
             prompt = line['sentence']
             answer = line['answer']
             answer = ' AB'[int(answer)] if answer != '' else 'NULL'
-            dataset_list.append({
-                'opt1':
-                prompt.replace('_', line['option1']),
-                'opt2':
-                prompt.replace('_', line['option2']),
-                'answer': answer
-            })
+            data_item = {
+                'opt1': prompt.replace('_', line['option1']),
+                'opt2': prompt.replace('_', line['option2']),
+                'answer': answer,
+            }
+            dataset_list.append(data_item)
         dataset_list = Dataset.from_list(dataset_list)
         return dataset_list
opencompass/lagent/actions/ipython_interpreter.py  (view file @ e78857ac)

@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
             it is disabled. Defaults to None.
         timeout (int): Upper bound of waiting time for Python script execution.
             Defaults to 20.
+        trim_output (int, optional): Max characters restriction of ipython
+            outputs. If None, do not perform any trim.
+            TODO: Notice that, this is not token len. Anf trim strategies
+            might be added later. Defaults to 1024.
         user_data_dir (str): Specified the user data directory for files
             loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
             Defaults to `ENV`.

@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
                  enable: bool = True,
                  disable_description: Optional[str] = None,
                  timeout: int = 20,
+                 trim_output: Optional[int] = 1024,
                  user_data_dir: str = 'ENV') -> None:
         super().__init__(description, name, enable, disable_description)

@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
             user_data_dir = os.environ.get('USER_DATA_DIR', '')
 
         if user_data_dir:
-            user_data_dir = os.path.dirname(user_data_dir)
+            # user_data_dir = os.path.dirname(user_data_dir)
             user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
         self.user_data_dir = user_data_dir
         self._initialized = False
+        self.trim_output = trim_output
         if not os.path.exists(WORK_DIR):
             os.mkdir(WORK_DIR)

@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
             if image:
                 result += f'\n\n{image}'
             if finished:
+                # in case output text too long
+                # might need better design later
+                if self.trim_output and len(result) > self.trim_output:
+                    ellip = '......'
+                    half_len = int((self.trim_output - len(ellip)) / 2)
+                    result = result[:half_len] + ellip + result[-half_len:]
                 return succeed, result
         try:

@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
                  command: str,
                  timeout: Optional[int] = None) -> ActionReturn:
         tool_return = ActionReturn(url=None, args=None, type=self.name)
-        tool_return.args = dict(text=command)
-        succeed, result = self._call(command, timeout)
-        if succeed:
-            tool_return.result = dict(text=result)
-            tool_return.state = ActionStatusCode.SUCCESS
+        extracted_command = extract_code(command)
+        tool_return.args = dict(text=command, extract_code=extracted_command)
+        if extracted_command:
+            succeed, result = self._call(extracted_command, timeout)
+            if succeed:
+                if not result:
+                    result = 'The code is succeed without any outputs.'
+                tool_return.result = dict(text=result)
+                tool_return.state = ActionStatusCode.SUCCESS
+            else:
+                tool_return.errmsg = repr(result)
+                tool_return.state = ActionStatusCode.API_ERROR
         else:
-            tool_return.errmsg = repr(result)
+            tool_return.errmsg = 'The input code is empty. Please follow the format.'  # noqa
             tool_return.state = ActionStatusCode.API_ERROR
         return tool_return
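The `trim_output` addition caps the interpreter's returned text at a character budget by keeping the head and tail around a '......' ellipsis. A standalone sketch of the trimming rule, with a budget chosen purely for illustration:

    def trim_result(result: str, trim_output: int = 1024) -> str:
        # Same character-budget rule as the new IPythonInterpreter path.
        if trim_output and len(result) > trim_output:
            ellip = '......'
            half_len = int((trim_output - len(ellip)) / 2)
            result = result[:half_len] + ellip + result[-half_len:]
        return result

    print(trim_result('x' * 50, trim_output=20))  # 'xxxxxxx......xxxxxxx'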
opencompass/models/base.py  (view file @ e78857ac)

@@ -115,6 +115,20 @@ class BaseModel:
         inputs = self.parse_template(templates, mode='ppl')
         return self.get_ppl(inputs, mask_length)
 
+    def get_loglikelihood_from_template(self,
+                                        templates: List[PromptType],
+                                        conts: List[str],
+                                        mask_length=None):
+        """Get perplexity given a list of templates.
+
+        Args:
+            templates (List[PromptType]): A list of templates.
+            mask_length (List[int]): A list of mask lengths. If provided, the
+                perplexity will be calculated only on the unmasked tokens.
+        """
+        inputs = self.parse_template(templates, mode='ppl')
+        return self.get_loglikelihood(inputs, conts, mask_length)
+
     def generate_from_template(self, templates: List[PromptType],
                                max_out_len: int, **kwargs):
         """Generate completion from a list of templates.
opencompass/models/base_api.py  (view file @ e78857ac)

 import re
 import sys
 import threading
+import time
 import warnings
 from abc import abstractmethod
 from copy import deepcopy
+from queue import Queue
 from time import sleep
 from typing import Dict, List, Optional, Tuple, Union

@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
     def __init__(self,
                  path: str,
                  query_per_second: int = 1,
+                 rpm_verbose: bool = False,
                  retry: int = 2,
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,

@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
         self.meta_template = meta_template
         self.retry = retry
         self.query_per_second = query_per_second
-        self.token_bucket = TokenBucket(query_per_second)
+        self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
         self.template_parser = APITemplateParser(meta_template)
         self.logger = get_logger()
         self.generation_kwargs = generation_kwargs

@@ -422,10 +425,13 @@ class TokenBucket:
         query_per_second (float): The rate of the token bucket.
     """
 
-    def __init__(self, rate):
+    def __init__(self, rate, verbose=False):
         self._rate = rate
         self._tokens = threading.Semaphore(0)
         self.started = False
+        self._request_queue = Queue()
+        self.logger = get_logger()
+        self.verbose = verbose
 
     def _add_tokens(self):
         """Add tokens to the bucket."""

@@ -440,3 +446,12 @@ class TokenBucket:
             self.started = True
             threading.Thread(target=self._add_tokens, daemon=True).start()
         self._tokens.acquire()
+        if self.verbose:
+            cur_time = time.time()
+            while not self._request_queue.empty():
+                if cur_time - self._request_queue.queue[0] > 60:
+                    self._request_queue.get()
+                else:
+                    break
+            self._request_queue.put(cur_time)
+            self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
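With `rpm_verbose=True`, the token bucket keeps a queue of request timestamps from the last 60 seconds and logs the queue size as the current RPM after each acquired token. A simplified, standalone sketch of that sliding-window count, without the semaphore and logger plumbing:

    import time
    from queue import Queue

    _request_queue = Queue()

    def record_request_and_get_rpm() -> int:
        # Drop timestamps older than 60 s, then count what remains.
        cur_time = time.time()
        while not _request_queue.empty():
            if cur_time - _request_queue.queue[0] > 60:
                _request_queue.get()
            else:
                break
        _request_queue.put(cur_time)
        return _request_queue.qsize()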
opencompass/models/huggingface.py  (view file @ e78857ac)

@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
 import numpy as np
 import torch
+import transformers
 
 from opencompass.models.base import BaseModel
 from opencompass.models.base_api import APITemplateParser

@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
 PromptType = Union[PromptList, str]
 
 
+class MultiTokenEOSCriteria(transformers.StoppingCriteria):
+    """Criteria to stop on the specified multi-token sequence."""
+
+    def __init__(
+        self,
+        sequence: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        batch_size: int,
+    ):
+        self.done_tracker = [False] * batch_size
+        self.sequence = sequence
+        self.sequence_ids = tokenizer.encode(sequence,
+                                             add_special_tokens=False)
+        self.sequence_id_len = len(self.sequence_ids)
+        self.tokenizer = tokenizer
+
+    def __call__(self, input_ids, scores, **kwargs) -> bool:
+        # compare the last len(stop) tokens
+        lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
+        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
+        for i, done in enumerate(self.done_tracker):
+            if done:
+                continue
+            self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
+        return False not in self.done_tracker
+
+
 @MODELS.register_module()
 class HuggingFace(BaseModel):
     """Model wrapper around HuggingFace models.

@@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
             self.model.config.eos_token_id = 2
         self.model.config.pad_token_id = self.tokenizer.pad_token_id
 
     def generate(self, inputs: List[str], max_out_len: int,
+                 stopping_criteria: List[str] = [],
                  **kwargs) -> List[str]:
         """Generate results given a list of inputs.

@@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
                 max_out_len=max_out_len,
                 **generation_kwargs)
         else:
-            return sum((self._single_generate(
-                inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
-                        for input_ in inputs), [])
+            return sum(
+                (self._single_generate(inputs=[input_],
+                                       max_out_len=max_out_len,
+                                       stopping_criteria=stopping_criteria,
+                                       **generation_kwargs)
+                 for input_ in inputs), [])
 
     def _batch_generate(self, inputs: List[str], max_out_len: int,
                         **kwargs) -> List[str]:

@@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
             decodeds = [token.split(self.end_str)[0] for token in decodeds]
         return decodeds
 
     def _single_generate(self, inputs: List[str], max_out_len: int,
+                         stopping_criteria: List[str] = [],
                          **kwargs) -> List[str]:
         """Support for single prompt inference.

@@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
                                        max_length=self.max_seq_len -
                                        max_out_len)['input_ids']
             input_ids = torch.tensor(input_ids, device=self.model.device)
 
+        if stopping_criteria:
+            # Construct huggingface stopping criteria
+            stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
+            stopping_criteria = transformers.StoppingCriteriaList([
+                *[
+                    MultiTokenEOSCriteria(sequence, self.tokenizer,
+                                          input_ids.shape[0])
+                    for sequence in stopping_criteria
+                ],
+            ])
+            kwargs['stopping_criteria'] = stopping_criteria
+
         # To accommodate the PeftModel, parameters should be passed in
         # key-value format for generate.
         outputs = self.model.generate(input_ids=input_ids,

@@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
         ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
         return ce_loss
 
+    def get_loglikelihood(
+            self,
+            inputs: List[str],
+            conts: List[str],
+            mask_length: Optional[List[int]] = None) -> List[float]:
+        """Get loglikelihood scores given a list of inputs.
+
+        Args:
+            inputs (List[str]): A list of strings.
+            conts (List[str]): A list of strings: slices after the space.
+            NOT SUPPORT mask_length YET!
+            mask_length (Optional[List[int]]): A list of mask lengths. If
+                provided, the perplexity scores will be calculated with the
+                first mask_length[i] tokens masked out. It's okay to skip
+                its implementation if advanced features in PPLInfernecer is
+                not needed.
+
+        Returns:
+            List[float]: A list of loglikelihood scores.
+        """
+        assert mask_length is None, 'Not support mask_length yet.'
+        if self.batch_padding and len(inputs) > 1:
+            raise NotImplementedError('Batch padding is not supported yet.')
+            # assert self.tokenizer.pad_token
+            # return self._get_loglikelihood(inputs, mask_length=mask_length)
+        return np.array([
+            self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
+            for idx in range(len(inputs))
+        ])
+
+    def _get_loglikelihood(self, inputs: str, conts: str) -> float:
+        """Get loglikelihood scores given input string and continuation string.
+
+        Args:
+            inputs (str): string.
+            conts (str): strings: slices after the space.
+        Returns:
+            float: loglikelihood scores.
+        """
+        input_ids = self.tokenizer(inputs,
+                                   padding=False,
+                                   truncation=True,
+                                   max_length=self.max_seq_len)['input_ids']
+        input_ids = torch.tensor(input_ids, device=self.model.device)
+        context_ids = self.tokenizer(inputs.replace(conts, ''),
+                                     padding=False,
+                                     truncation=True,
+                                     max_length=self.max_seq_len)['input_ids']
+        cont_ids = input_ids[len(context_ids):]
+
+        output = self.model(input_ids.unsqueeze(0))
+        logits = output['logits'][:, :-1]
+        logits = torch.nn.functional.log_softmax(logits, dim=-1)
+        contlen = cont_ids.shape[0]
+        logits = logits[:, -contlen:, :]
+        # Reducing the dimension will lead to a wrong outcome
+        logits_gather = torch.gather(
+            logits, 2,
+            cont_ids.unsqueeze(0).unsqueeze(-1))  # [1, seq]
+
+        # Answer: sum the likelihood of each token in continuation
+        answer = float(logits_gather.detach().cpu().sum())
+        return answer
+
     def get_token_len(self, prompt: str) -> int:
         """Get lengths of the tokenized strings.

@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
                     'role': {
                         'HUMAN': 'user',
                         'BOT': 'assistant',
-                        'SYSTEM': 'system'
-                    }[item['role']]
+                        'SYSTEM': 'system',
+                    }[item['role'].upper()]
                 }
                 history.append(msg)
             user_content = history[-1]['content']

@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
                 response, history = self.model.chat(self.tokenizer,
                                                     user_content,
                                                     history=history)
+                # response will be dict sometime
+                if isinstance(response, dict):
+                    response = response.get('content', '')
                 responses.append(response)
             except Exception:
                 responses.append('')
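The new `MultiTokenEOSCriteria` lets `generate()` stop on arbitrary strings rather than a single EOS token id: at each step it decodes the last few tokens of every sequence in the batch and reports done once the stop string has appeared in all of them. A hedged usage sketch outside the wrapper; the 'gpt2' checkpoint and stop words are placeholders, not taken from this commit:

    import transformers
    from opencompass.models.huggingface import MultiTokenEOSCriteria

    tok = transformers.AutoTokenizer.from_pretrained('gpt2')
    model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

    inputs = tok('Question: 1 + 1 = ?\nAnswer:', return_tensors='pt')
    stop_words = ['\nQuestion:', tok.eos_token]
    criteria = transformers.StoppingCriteriaList([
        MultiTokenEOSCriteria(seq, tok, inputs['input_ids'].shape[0])
        for seq in stop_words
    ])
    out = model.generate(**inputs, max_new_tokens=32,
                         stopping_criteria=criteria)
    print(tok.decode(out[0]))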
opencompass/models/lagent.py  (view file @ e78857ac)

@@ -52,7 +52,7 @@ class LagentAgent:
     def chat(self,
              user_input: str,
-             history: List[dict] = None) -> Tuple[str, List[dict]]:
+             history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
         """Chat with agent."""
         if history:
             self.agent._session_history = history

@@ -60,6 +60,7 @@ class LagentAgent:
         from lagent.schema import ActionReturn, AgentReturn
         generation: AgentReturn = self.agent.chat(user_input)
+        inner_steps = generation.inner_steps
         answer = generation.response
         steps = []

@@ -76,7 +77,7 @@ class LagentAgent:
                     valid=int(step.valid),
                 ))
-        return answer, steps
+        return answer, steps, inner_steps
 
 
 FORCE_STOP_PROMPT_EN = (
opencompass/models/llama2.py  (view file @ e78857ac)

@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
             dialog = []
             for item in input:
                 msg = {'content': item['prompt']}
-                if item['role'] == 'HUMAN':
+                if item['role'].upper() == 'HUMAN':
                     msg['role'] = 'user'
-                elif item['role'] == 'BOT':
+                elif item['role'].upper() == 'BOT':
                     msg['role'] = 'assistant'
-                elif item['role'] == 'SYSTEM':
+                elif item['role'].upper() == 'SYSTEM':
                     msg['role'] = 'system'
+                else:
+                    raise ValueError(f'Unknown role: {item["role"]}')
                 dialog.append(msg)
             dialogs.append(dialog)