OpenDAS / opencompass · Commits

Commit dbb20b82 (Unverified)
Authored Oct 27, 2023 by Fengzhe Zhou; committed by GitHub, Oct 27, 2023

[Sync] update (#517)

Parent: 6f07af30

Changes: 45 files in total; this page shows 20 changed files with 295 additions and 27 deletions (+295, -27).
Files shown on this page:

```
.gitignore                                          +4   -1
configs/datasets/agieval/agieval_gen_397d81.py      +204 -0
configs/datasets/agieval/agieval_mixed_2f14ad.py    +2   -2
configs/datasets/bbh/bbh_gen_5b92b0.py              +2   -2
configs/datasets/bbh/bbh_gen_5bf00b.py              +2   -2
configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py          +2   -2
configs/datasets/gsm8k/gsm8k_gen_1dce88.py          +2   -2
configs/datasets/gsm8k/gsm8k_gen_a3e34a.py          +2   -2
configs/datasets/gsm8k/gsm8k_gen_e9e91e.py          +2   -2
configs/models/claude/claude.py                     +5   -1
docs/en/user_guides/experimentation.md              +2   -1
docs/zh_cn/user_guides/experimentation.md           +2   -1
opencompass/datasets/afqmcd.py                      +1   -1
opencompass/datasets/agieval/agieval.py             +28  -1
opencompass/datasets/bbh.py                         +28  -1
opencompass/datasets/bustum.py                      +1   -1
opencompass/datasets/c3.py                          +2   -2
opencompass/datasets/ceval.py                       +2   -1
opencompass/datasets/chid.py                        +1   -1
opencompass/datasets/cluewsc.py                     +1   -1
```
.gitignore

```diff
@@ -11,7 +11,7 @@ configs/eval_debug*.py
 configs/viz_*.py
 data
 work_dirs
+configs/internal/

 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -86,3 +86,6 @@ docs/zh_cn/_build/
 # .zip
 *.zip
+
+# sft config ignore list
+configs/sft_cfg/*B_*
```
configs/datasets/agieval/agieval_gen_397d81.py (new file, 0 → 100644)

```python
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi

agieval_reader_cfg = dict(
    input_columns=['question', 'options'], output_column='label')

agieval_single_choice_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
    'logiqa-zh',
    'lsat-ar',
    'lsat-lr',
    'lsat-rc',
    'logiqa-en',
    'sat-math',
    'sat-en',
    'sat-en-without-passage',
    'aqua-rat',
]
agieval_multiple_choices_sets = [
    'jec-qa-kd',
    'jec-qa-ca',
]
agieval_cloze_sets = ['gaokao-mathcloze', 'math']
agieval_chinese_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
    'logiqa-zh',
    'gaokao-mathcloze',
]
agieval_english_sets = [
    'lsat-ar',
    'lsat-lr',
    'lsat-rc',
    'logiqa-en',
    'sat-math',
    'sat-en',
    'sat-en-without-passage',
    'aqua-rat',
    'math',
]
agieval_gaokao_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
]

agieval_datasets = []
for _name in agieval_single_choice_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN',
                     prompt=f'{{question}}\n{{options}}\n{_hint}')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _name in agieval_multiple_choices_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN',
                     prompt=f'{{question}}\n{{options}}\n{_hint}')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess_multi))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _name in agieval_cloze_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _item in agieval_datasets:
    _name = _item['name']
    _intro = {
        'gaokao-chinese': '以下是一道中国高考语文选择题,请选择正确的答案。',
        'gaokao-english': '以下是一道中国高考英语选择题,请选择正确的答案。',
        'gaokao-geography': '以下是一道中国高考地理选择题,请选择正确的答案。',
        'gaokao-history': '以下是一道中国高考历史选择题,请选择正确的答案。',
        'gaokao-biology': '以下是一道中国高考生物选择题,请选择正确的答案。',
        'gaokao-chemistry': '以下是一道中国高考化学选择题,请选择正确的答案。',
        'gaokao-physics': '以下是一道中国高考物理选择题,请选择正确的答案。',
        'gaokao-mathqa': '以下是一道中国高考数学选择题,请选择正确的答案。',
        'logiqa-zh': '以下是一道中国公务员考试题,请选择正确的答案。',
        'lsat-ar': 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
        'lsat-lr': 'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
        'lsat-rc': 'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
        'logiqa-en': 'The following is a Logic Reasoning question. Please select the correct answer.',
        'sat-math': 'The following is a SAT Math question. Please select the correct answer.',
        'sat-en': 'The following is a SAT English question. Please select the correct answer.',
        'sat-en-without-passage': 'The following is a SAT English question. Please select the correct answer.',
        'aqua-rat': 'The following is a AQUA-RAT question. Please select the correct answer.',
        'jec-qa-kd': '以下是一道中国司法考试基础知识题,请选择正确的答案。',
        'jec-qa-ca': '以下是一道中国司法考试案例分析题,请选择正确的答案。',
        'gaokao-mathcloze': '以下是一道中国高考数学填空题,请填入正确的答案。',
        'math': 'The following is a Math question. Please select the correct answer.',
    }[_name]
    _templates = _item['infer_cfg']['prompt_template']['template']
    _templates['round'][0][
        'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']

del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
```
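For orientation, a minimal sketch of how a config like this can be inspected (not part of the commit; it assumes OpenCompass and its mmengine dependency are installed and that the snippet runs from the repo root):

```python
# Hypothetical inspection snippet: load the new config and list each
# AGIEval entry's abbreviation together with its evaluator class.
from mmengine.config import Config

cfg = Config.fromfile('configs/datasets/agieval/agieval_gen_397d81.py')
for ds in cfg['agieval_datasets']:
    print(ds['abbr'], ds['eval_cfg']['evaluator']['type'])
```

The single-choice and multiple-choice sets resolve to AccEvaluator (paired with a capital-letter postprocessor), while the cloze sets resolve to AGIEvalEvaluator.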
configs/datasets/agieval/agieval_mixed_2f14ad.py

```diff
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
+from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq
 from opencompass.utils.text_postprocessors import first_capital_postprocess_multi

 agieval_single_choice_sets = [
@@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets:
         inferencer=dict(type=GenInferencer, max_out_len=1024))

     agieval_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=AGIEvalEvaluator_mcq),
         pred_postprocessor=dict(type=first_capital_postprocess_multi))

     agieval_datasets.append(
```
configs/datasets/bbh/bbh_gen_5b92b0.py

```diff
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

 bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
@@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
         retriever=dict(type=ZeroRetriever),
         inferencer=dict(type=GenInferencer, max_out_len=512))
     bbh_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=BBHEvaluator_mcq),
         pred_role="BOT",
         pred_postprocessor=dict(type=bbh_mcq_postprocess),
         dataset_postprocessor=dict(type=bbh_mcq_postprocess))
```
configs/datasets/bbh/bbh_gen_5bf00b.py

```diff
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

 bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
@@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
         retriever=dict(type=ZeroRetriever),
         inferencer=dict(type=GenInferencer, max_out_len=512))
     bbh_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=BBHEvaluator_mcq),
         pred_role="BOT",
         pred_postprocessor=dict(type=bbh_mcq_postprocess),
         dataset_postprocessor=dict(type=bbh_mcq_postprocess))
```
configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py

```diff
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer, max_out_len=512))

-gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                       pred_postprocessor=dict(type=gsm8k_postprocess),
                       dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
```
configs/datasets/gsm8k/gsm8k_gen_1dce88.py

```diff
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@@ -72,7 +72,7 @@ Question: {question}{answer}
     inferencer=dict(type=GenInferencer, max_out_len=512))

 gsm8k_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=Gsm8kEvaluator),
     pred_postprocessor=dict(type=gsm8k_postprocess),
     dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
```
configs/datasets/gsm8k/gsm8k_gen_a3e34a.py

```diff
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import SCInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
 generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40)
@@ -73,7 +73,7 @@ Question: {question}{answer}
     inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs=generation_kwargs, infer_type='sc', sc_size=20))

 gsm8k_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=Gsm8kEvaluator),
     pred_postprocessor=dict(type=gsm8k_postprocess),
     dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
     sc_size=20)
```
configs/datasets/gsm8k/gsm8k_gen_e9e91e.py

```diff
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
@@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                       pred_role="BOT",
                       pred_postprocessor=dict(type=gsm8k_postprocess),
                       dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
```
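All four GSM8K configs receive the same one-line swap from AccEvaluator to Gsm8kEvaluator. A hedged spot-check, under the same assumptions as the AGIEval snippet above:

```python
# Hypothetical: confirm the loaded config now references Gsm8kEvaluator.
from mmengine.config import Config

cfg = Config.fromfile('configs/datasets/gsm8k/gsm8k_gen_e9e91e.py')
print(cfg['gsm8k_eval_cfg']['evaluator']['type'])  # expected: Gsm8kEvaluator
```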
configs/models/claude/claude.py

```diff
 from opencompass.models.claude_api.claude_api import Claude
+from opencompass.models.claude_api.postprocessors import (
+    flores_postprocess, gsm8k_postprocess, humaneval_postprocess,
+    lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess)
 from opencompass.utils.text_postprocessors import last_option_postprocess
-from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess

 agieval_single_choice_sets = [
     'gaokao-chinese',
@@ -47,6 +49,8 @@ claude_postprocessors = {
     'lcsts': dict(type=lcsts_postprocess),
     'mbpp': dict(type=mbpp_postprocess),
     'strategyqa': dict(type=strategyqa_pred_postprocess),
+    'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'),
+    'flores_100_*-zho_simpl': dict(type=flores_postprocess),
 }

 for _name in agieval_multiple_choices_sets + agieval_single_choice_sets:
```
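The two added routing entries reuse existing postprocessors. A toy illustration of the `commonsense_qa` route (the exact extraction behaviour of `last_option_postprocess` is inferred from its name and `options` argument, so treat the expected output as an assumption):

```python
from opencompass.utils.text_postprocessors import last_option_postprocess

# Assumed behaviour: return the last option letter among 'ABCDE' that the
# model's reply mentions, tolerating chatty answers like the one below.
raw = 'B looks plausible, but on reflection the answer is D.'
print(last_option_postprocess(raw, options='ABCDE'))  # expected: 'D'
```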
docs/en/user_guides/experimentation.md

````diff
@@ -5,7 +5,7 @@
 The program entry for the evaluation task is `run.py`. The usage is as follows:

 ```shell
-python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
+python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
 ```

 Task Configuration (`$EXP`):
@@ -66,6 +66,7 @@ The parameter explanation is as follows:
 - `-w`: Specify the working path, default is `./outputs/default`.
 - `-l`: Enable status reporting via Lark bot.
 - `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run, for debugging.
+- `--dump-eval-details`: When enabled, the evaluation results under the `results` folder will include more details, such as the correctness of each sample.

 Using run mode `-m all` as an example, the overall execution flow is as follows:
````
docs/zh_cn/user_guides/experimentation.md

````diff
@@ -5,7 +5,7 @@
 评测任务的程序入口为 `run.py`,使用方法如下:

 ```shell
-python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
+python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
 ```

 任务配置 (`$EXP`):
@@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
 - `-w`: 指定工作路径,默认为 `./outputs/default`
 - `-l`: 打开飞书机器人状态上报。
 - `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
+- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。

 以运行模式 `-m all` 为例,整体运行流如下:
````
opencompass/datasets/afqmcd.py

```diff
@@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset):

     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 line['label'] = 'AB'[int(line['label'])]
```
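The same one-line `encoding='utf-8'` fix recurs in `bustum.py`, `c3.py`, `ceval.py`, `chid.py`, and `cluewsc.py` below. A minimal, standard-library illustration of the failure mode it guards against:

```python
import json
import os
import tempfile

# Write a UTF-8 JSON line like the AFQMC data. Without encoding='utf-8',
# open() falls back to the platform-default codec (e.g. GBK or ASCII on
# some systems), which can raise UnicodeDecodeError or silently mis-decode
# the Chinese text; pinning the encoding makes the loader deterministic.
path = os.path.join(tempfile.mkdtemp(), 'dev.json')
with open(path, 'w', encoding='utf-8') as f:
    f.write('{"sentence1": "天气不错", "label": "1"}\n')

with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        line = json.loads(line)
        line['label'] = 'AB'[int(line['label'])]  # same mapping as the loader
        print(line['label'])  # B
```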
opencompass/datasets/agieval/agieval.py

```diff
@@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator):

     def score(self, predictions, references):
         predictions = [parse_math_answer('', pred) for pred in predictions]
+        details = []
         cnt = 0
         for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
             if is_equiv(pred, ref):
                 cnt += 1
+                detail['correct'] = True
+            details.append(detail)

         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class AGIEvalEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
```
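A toy check of the new class's contract (illustrative only; it assumes this `BaseEvaluator` subclass can be instantiated without arguments, and uses the `from opencompass.datasets import ...` path the configs above rely on):

```python
from opencompass.datasets import AGIEvalEvaluator_mcq

evaluator = AGIEvalEvaluator_mcq()
result = evaluator.score(predictions=['A', 'BD'], references=['A', 'BC'])
print(result['score'])       # 50.0 (exact string match per sample)
print(result['details'][1])  # {'pred': 'BD', 'answer': 'BC', 'correct': False}
```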
opencompass/datasets/bbh.py

```diff
@@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator):

         predictions = [bbh_freeform_postprocess(pred) for pred in predictions]

+        details = []
         cnt = 0
         for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
             if pred == ref:
                 cnt += 1
+                detail['correct'] = True
+            details.append(detail)

         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class BBHEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
```
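Because both new evaluators return per-sample `details`, downstream tooling can slice results after the fact; a small hypothetical usage sketch mirroring the dict shape defined above:

```python
from opencompass.datasets import BBHEvaluator_mcq

result = BBHEvaluator_mcq().score(['(A)', '(B)', '(C)'], ['(A)', '(C)', '(C)'])
failures = [d for d in result['details'] if not d['correct']]
print(round(result['score'], 1))  # 66.7
print(failures)  # [{'pred': '(B)', 'answer': '(C)', 'correct': False}]
```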
opencompass/datasets/bustum.py

```diff
@@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset):

     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 line['label'] = 'AB'[int(line['label'])]
```
opencompass/datasets/c3.py

```diff
@@ -13,7 +13,7 @@ class C3Dataset(BaseDataset):

     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
             rows = []
             for _, row in enumerate(data):
@@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset):

     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             raw = json.load(f)
         data = []
         for line in raw:
```
opencompass/datasets/ceval.py

```diff
@@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset):
     def load(path: str, name: str):
         dataset = {}
         for split in ['dev', 'val', 'test']:
-            with open(osp.join(path, split, f'{name}_{split}.csv')) as f:
+            filename = osp.join(path, split, f'{name}_{split}.csv')
+            with open(filename, encoding='utf-8') as f:
                 reader = csv.reader(f)
                 header = next(reader)
                 for row in reader:
```
opencompass/datasets/chid.py

```diff
@@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset):

     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {}
```
opencompass/datasets/cluewsc.py

```diff
@@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset):

     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
```