Unverified commit aa2dd2b5, authored by Fengzhe Zhou, committed by GitHub
Browse files

[Format] Add config lints (#892)

parent 3dbba119
......@@ -10,32 +10,32 @@ with read_base():
GaokaoBench_datasets = []
for folder, prompts in [
("Multiple-choice_Questions", MCQ_prompts),
("Fill-in-the-blank_Questions", FBQ_prompts),
('Multiple-choice_Questions', MCQ_prompts),
('Fill-in-the-blank_Questions', FBQ_prompts),
]:
for p in prompts:
reader_cfg = {
"input_columns": ["question"],
"output_column": "answer",
'input_columns': ['question'],
'output_column': 'answer',
}
infer_cfg = {
"prompt_template": {
"type": PromptTemplate,
"template": p["prefix_prompt"] + "{question}",
'prompt_template': {
'type': PromptTemplate,
'template': p['prefix_prompt'] + '{question}',
},
"retriever": {"type": ZeroRetriever},
"inferencer": {"type": GenInferencer, "max_out_len": 1024},
'retriever': {'type': ZeroRetriever},
'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
}
eval_cfg = {
"evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
"pred_role": "BOT",
'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
'pred_role': 'BOT',
}
dataset = {
"type": GaokaoBenchDataset,
"abbr": "GaokaoBench_" + p["keyword"],
"path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
"reader_cfg": reader_cfg,
"infer_cfg": infer_cfg,
"eval_cfg": eval_cfg,
'type': GaokaoBenchDataset,
'abbr': 'GaokaoBench_' + p['keyword'],
'path': os.path.join('data', 'GAOKAO-BENCH', 'data', folder, p['keyword'] + '.json'),
'reader_cfg': reader_cfg,
'infer_cfg': infer_cfg,
'eval_cfg': eval_cfg,
}
GaokaoBench_datasets.append(dataset)
......@@ -3,7 +3,7 @@
The following introduction comes from the abstract of [Instruction Following Evaluation for Large Language Models](https://arxiv.org/abs/2311.07911)
```
IFEval is a straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set of "verifiable instructions" such as "write in more than 400 words" and "mention the keyword of AI at least 3 times". We identified 25 types of those verifiable instructions and constructed around 500 prompts, with each prompt containing one or more verifiable instructions.
IFEval is a straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set of "verifiable instructions" such as "write in more than 400 words" and "mention the keyword of AI at least 3 times". We identified 25 types of those verifiable instructions and constructed around 500 prompts, with each prompt containing one or more verifiable instructions.
```
## Official link
......@@ -52,4 +52,4 @@ IFEval 3321a3 Inst-level-loose-accuracy gen 48.
journal={arXiv preprint arXiv:2311.07911},
year={2023}
}
```
\ No newline at end of file
```
from mmengine.config import read_base
with read_base():
from .IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, F403
\ No newline at end of file
from .IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, F403
......@@ -9,8 +9,8 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
MMLUArabic_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
......@@ -19,7 +19,7 @@ MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفل
MMLUArabic_datasets = []
for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
_system = f"فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول {' '.join(_name_ar.split('_'))}"
_hint = "\n{input}"
_hint = '\n{input}'
MMLUArabic_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
......@@ -30,12 +30,12 @@ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
],
round=[
dict(
role="HUMAN",
prompt=_hint.format(input="سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}")
role='HUMAN',
prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}')
),
dict(role="BOT", prompt="إجابة: {target}")
dict(role='BOT', prompt='إجابة: {target}')
]),
ice_token="</E>",
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer),
......@@ -47,9 +47,9 @@ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
MMLUArabic_datasets.append(
dict(
abbr=f"acegpt_MMLUArabic_{_name}",
abbr=f'acegpt_MMLUArabic_{_name}',
type=MMLUArabicDataset,
path="./data/MMLUArabic/",
path='./data/MMLUArabic/',
name=_name,
reader_cfg=MMLUArabic_reader_cfg,
infer_cfg=MMLUArabic_infer_cfg,
......
......@@ -8,8 +8,8 @@ from opencompass.datasets import MMLUArabicDataset
# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
MMLUArabic_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
......@@ -20,16 +20,16 @@ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
# _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
_hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n"
# question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
question_overall = "سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
question_overall = 'سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
MMLUArabic_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={opt: f"{question_overall}\nإجابة: {opt}\n" for opt in ["A", "B", "C", "D"]},
template={opt: f'{question_overall}\nإجابة: {opt}\n' for opt in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={opt: f"{_hint}</E>{question_overall}\nإجابة: {opt}" for opt in ["A", "B", "C", "D"]},
ice_token="</E>",
template={opt: f'{_hint}</E>{question_overall}\nإجابة: {opt}' for opt in ['A', 'B', 'C', 'D']},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
......@@ -39,9 +39,9 @@ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
MMLUArabic_datasets.append(
dict(
abbr=f"acegpt_MMLUArabic_{_name}",
abbr=f'acegpt_MMLUArabic_{_name}',
type=MMLUArabicDataset,
path="./data/MMLUArabic/",
path='./data/MMLUArabic/',
name=_name,
reader_cfg=MMLUArabic_reader_cfg,
infer_cfg=MMLUArabic_infer_cfg,
......
......@@ -9,8 +9,8 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
MMLUArabic_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
......@@ -18,18 +18,18 @@ MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفل
MMLUArabic_datasets = []
for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
_hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + "{input}\n" + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح."
_hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + '{input}\n' + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح."
MMLUArabic_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=_hint.format(input="سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}")
role='HUMAN',
prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}')
),
]),
ice_token="</E>",
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
......@@ -41,9 +41,9 @@ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
MMLUArabic_datasets.append(
dict(
abbr=f"acegpt_MMLUArabic_{_name}",
abbr=f'acegpt_MMLUArabic_{_name}',
type=MMLUArabicDataset,
path="./data/MMLUArabic/",
path='./data/MMLUArabic/',
name=_name,
reader_cfg=MMLUArabic_reader_cfg,
infer_cfg=MMLUArabic_infer_cfg,
......
......@@ -10,17 +10,17 @@ Download file from [link](https://github.com/FreedomIntelligence/AceGPT/tree/mai
val_ds = load_dataset("MMLUArabic", header=None)['validation']
test_ds = load_dataset("MMLUArabic", header=None)['test']
# input, option_a, option_b, option_c, option_d, target
print(next(iter(val_ds)))
print(next(iter(val_ds)))
```
## Citation
```
@misc{huang2023acegpt,
title={AceGPT, Localizing Large Language Models in Arabic},
title={AceGPT, Localizing Large Language Models in Arabic},
author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
year={2023},
eprint={2309.12053},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
\ No newline at end of file
```
......@@ -6,14 +6,14 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
"cloze_cn": [
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
......@@ -33,7 +33,7 @@ cloze_prompts = {
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
......@@ -78,10 +78,10 @@ for _split in list(mathbench_sets.keys()):
template=dict(
round=[
dict(
role="HUMAN",
prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
role='HUMAN',
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
......@@ -94,14 +94,14 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
path=f'./data/mathbench/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -9,7 +9,7 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
PROMPT_EN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
'FEWSHOT_INSTRUCTION_CLOZE' : [
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
dict(role='SYSTEM', prompt='Response:210'),
......@@ -27,7 +27,7 @@ PROMPT_EN = {
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
'FEWSHOT_INSTRUCTION_CHOICE' : [
dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
......@@ -48,7 +48,7 @@ PROMPT_EN = {
}
PROMPT_CN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
'FEWSHOT_INSTRUCTION_CLOZE' : [
dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球,8个3分球和10个罚球。他们的对手得到2分球的两倍,但3分球和罚球的一半。两队得分的总和是多少?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
dict(role='SYSTEM', prompt='Response:210'),
......@@ -66,7 +66,7 @@ PROMPT_CN = {
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
'FEWSHOT_INSTRUCTION_CHOICE' : [
dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
......@@ -105,7 +105,7 @@ for _split in list(mathbench_sets.keys()):
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate,
template=dict(
round = prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
round = prompt_example['FEWSHOT_INSTRUCTION_CLOZE'] if 'cloze' in _name else prompt_example['FEWSHOT_INSTRUCTION_CHOICE'])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer)
)
......@@ -116,14 +116,14 @@ for _split in list(mathbench_sets.keys()):
mathbench_agent_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name + '-agent',
abbr='mathbench-' + _split + '-' + _name + '-agent',
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
path=f'./data/mathbench/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -6,14 +6,14 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
"cloze_cn": [
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
......@@ -33,7 +33,7 @@ cloze_prompts = {
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
......@@ -84,10 +84,10 @@ for _split in list(mathbench_sets.keys()):
template=dict(
round=[
dict(
role="HUMAN",
prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
role='HUMAN',
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
......@@ -100,14 +100,14 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench_v1/{_split}",
path=f'./data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -6,67 +6,67 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
"cloze_cn": [
dict(role="HUMAN", prompt="Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?"),
dict(role="BOT", prompt="A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n"),
dict(role="HUMAN", prompt="Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?"),
dict(role="BOT", prompt="A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n"),
dict(role="HUMAN", prompt="Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?"),
dict(role="BOT", prompt="A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n"),
dict(role="HUMAN", prompt="Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?"),
dict(role="BOT", prompt="A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n"),
dict(role="HUMAN", prompt="Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?"),
dict(role="BOT", prompt="A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n"),
dict(role="HUMAN", prompt="Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?"),
dict(role="BOT", prompt="A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n"),
dict(role="HUMAN", prompt="Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?"),
dict(role="BOT", prompt="A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n"),
dict(role="HUMAN", prompt="Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?"),
dict(role="BOT", prompt="A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n"),
dict(role="HUMAN", prompt="Q: {question}"),
dict(role="BOT", prompt="A: {answer}"),
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
dict(role="HUMAN", prompt="Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?"),
dict(role="BOT", prompt="A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n"),
dict(role="HUMAN", prompt="Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?"),
dict(role="BOT", prompt="A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n"),
dict(role="HUMAN", prompt="Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?"),
dict(role="BOT", prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role="HUMAN", prompt="Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?"),
dict(role="BOT", prompt="A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n"),
dict(role="HUMAN", prompt="Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?"),
dict(role="BOT", prompt="A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n"),
dict(role="HUMAN", prompt="Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?"),
dict(role="BOT", prompt="A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n"),
dict(role="HUMAN", prompt="Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?"),
dict(role="BOT", prompt="A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n"),
dict(role="HUMAN", prompt="Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?"),
dict(role="BOT", prompt="A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n"),
dict(role="HUMAN", prompt="Q: {question}"),
dict(role="BOT", prompt="A: {answer}\n"),
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
],
}
# Maps each MathBench split name to the prompt-template names evaluated for it;
# the loop that follows iterates the keys as `_split` and each listed value as
# `_name`.
# NOTE(review): every entry appears twice, double-quoted and then single-quoted
# -- presumably the before/after sides of a quote-style lint change. As a Python
# literal the later duplicate key simply overrides the earlier, equal one.
mathbench_sets = {
# Practice Part
"college": ["single_choice_cn", "single_choice_en"],
"high": ["single_choice_cn", "single_choice_en"],
"middle": ["single_choice_cn", "single_choice_en"],
"primary": ["cloze_cn", "cloze_en"],
"calculate": ["cloze_en"],
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'calculate': ['cloze_en'],
# Theory part
"college_knowledge": ["single_choice_cn", "single_choice_en"],
"high_knowledge": ["single_choice_cn", "single_choice_en"],
"middle_knowledge": ["single_choice_cn", "single_choice_en"],
"primary_knowledge": ["single_choice_cn", "single_choice_en"],
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
# Generate reasoning path or not, only for single choice
......@@ -80,14 +80,14 @@ mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_reader_cfg = dict(
input_columns=["question"],
output_column="answer",
input_columns=['question'],
output_column='answer',
)
if "single_choice" in _name:
if 'single_choice' in _name:
template_round = [
dict(role="HUMAN", prompt=(single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name])),
dict(role="BOT", prompt="{answer}")
dict(role='HUMAN', prompt=(single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name])),
dict(role='BOT', prompt='{answer}')
]
else:
template_round = cloze_prompts[_name]
......@@ -97,11 +97,11 @@ for _split in list(mathbench_sets.keys()):
inferencer=dict(type=GenInferencer, max_out_len=512),
)
if "single_choice" in _name:
pred_postprocessor = dict(type=first_option_postprocess, options="ABCD")
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if "single_choice" in _name and with_circular_eval:
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
......@@ -112,9 +112,9 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + "-" + _name,
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench_v1_ori/{_split}",
path=f'./data/mathbench_v1_ori/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
......
......@@ -6,49 +6,49 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
# Prompt templates for single-choice questions, keyed by template name
# (`single_choice_cn` / `single_choice_en`). The `_with_reasoning` variants are
# the ones looked up as `_name + '_with_reasoning'` when `with_reasoning` is set.
# In this variant the direct-answer templates end with a trailing "\n" and are
# used as prefixes to which each candidate option letter is appended.
# NOTE(review): `{question}` is presumably filled from the `question` input
# column -- confirm against the reader config.
# NOTE(review): each key appears twice (double-quoted, then single-quoted) with
# equal values -- presumably the two sides of a quote-style lint change.
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n",
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n',
}
cloze_prompts = {
"cloze_cn": [
"Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6",
"Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5",
"Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?\nA: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39",
"Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?\nA: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8",
"Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9",
"Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29",
"Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?\nA: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33",
"Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8",
"Q: {question}\nA: {answer}",
'cloze_cn': [
'Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6',
'Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5',
'Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?\nA: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39',
'Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?\nA: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8',
'Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9',
'Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29',
'Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?\nA: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33',
'Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8',
'Q: {question}\nA: {answer}',
],
"cloze_en": [
"Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.",
"Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.",
'cloze_en': [
'Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.',
'Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.',
"Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
"Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.",
"Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.",
"Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.",
"Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.",
"Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.",
"Q: {question}\nA: {answer}",
'Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.',
'Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.',
'Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.',
'Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.',
'Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.',
'Q: {question}\nA: {answer}',
],
}
# Maps each MathBench split name to the prompt-template names evaluated for it;
# the loop that follows iterates the keys as `_split` and each listed value as
# `_name`.
# NOTE(review): every entry appears twice, double-quoted and then single-quoted
# -- presumably the before/after sides of a quote-style lint change. As a Python
# literal the later duplicate key simply overrides the earlier, equal one.
mathbench_sets = {
# Practice Part
"college": ["single_choice_cn", "single_choice_en"],
"high": ["single_choice_cn", "single_choice_en"],
"middle": ["single_choice_cn", "single_choice_en"],
"primary": ["cloze_cn", "cloze_en"],
"calculate": ["cloze_en"],
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'calculate': ['cloze_en'],
# Theory part
"college_knowledge": ["single_choice_cn", "single_choice_en"],
"high_knowledge": ["single_choice_cn", "single_choice_en"],
"middle_knowledge": ["single_choice_cn", "single_choice_en"],
"primary_knowledge": ["single_choice_cn", "single_choice_en"],
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
# Generate reasoning path or not, only for single choice
......@@ -62,20 +62,20 @@ mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_reader_cfg = dict(
input_columns=["question"],
output_column="answer",
input_columns=['question'],
output_column='answer',
)
if "single_choice" in _name:
if 'single_choice' in _name:
if with_reasoning:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + "_with_reasoning"]),
prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + '_with_reasoning']),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template={answer: f"{single_choice_prompts[_name]}{answer}" for answer in ['A', 'B', 'C', 'D']}),
prompt_template=dict(type=PromptTemplate, template={answer: f'{single_choice_prompts[_name]}{answer}' for answer in ['A', 'B', 'C', 'D']}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
......@@ -87,11 +87,11 @@ for _split in list(mathbench_sets.keys()):
)
if "single_choice" in _name:
pred_postprocessor = dict(type=first_option_postprocess, options="ABCD")
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if "single_choice" in _name and with_circular_eval:
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
......@@ -102,9 +102,9 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + "-" + _name,
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench_v1_ori/{_split}",
path=f'./data/mathbench_v1_ori/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
......
......@@ -7,7 +7,7 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
PROMPT_EN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
'FEWSHOT_INSTRUCTION_CLOZE' : [
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
dict(role='SYSTEM', prompt='Response:210'),
......@@ -25,7 +25,7 @@ PROMPT_EN = {
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
'FEWSHOT_INSTRUCTION_CHOICE' : [
dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result\n```'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
......@@ -46,7 +46,7 @@ PROMPT_EN = {
}
PROMPT_CN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
'FEWSHOT_INSTRUCTION_CLOZE' : [
dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球,8个3分球和10个罚球。他们的对手得到2分球的两倍,但3分球和罚球的一半。两队得分的总和是多少?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
dict(role='SYSTEM', prompt='Response:210'),
......@@ -64,7 +64,7 @@ PROMPT_CN = {
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
'FEWSHOT_INSTRUCTION_CHOICE' : [
dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result\n```'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
......@@ -103,7 +103,7 @@ for _split in list(mathbench_sets.keys()):
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate,
template=dict(
round = prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
round = prompt_example['FEWSHOT_INSTRUCTION_CLOZE'] if 'cloze' in _name else prompt_example['FEWSHOT_INSTRUCTION_CHOICE'])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer)
)
......@@ -114,14 +114,14 @@ for _split in list(mathbench_sets.keys()):
mathbench_agent_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name + '-agent',
abbr='mathbench-' + _split + '-' + _name + '-agent',
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
path=f'./data/mathbench/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -5,7 +5,7 @@ from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
cloze_prompts ={
"cloze_arith_en": [
'cloze_arith_en': [
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
......@@ -45,13 +45,13 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
path=f'./data/mathbench/{_split}',
name=_name,
with_circular=False,
abbr="mathbench-arithmetic" + _split + '-' + _name,
abbr='mathbench-arithmetic' + _split + '-' + _name,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -7,14 +7,14 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
# Prompt templates for single-choice questions, keyed by template name
# (`single_choice_cn` / `single_choice_en`). The `_with_reasoning` variants are
# the ones looked up as `_name + '_with_reasoning'` when `with_reasoning` is set;
# the selected template becomes the HUMAN turn of the prompt round.
# NOTE(review): `{question}` is presumably filled from the `question` input
# column -- confirm against the reader config.
# NOTE(review): each key appears twice (double-quoted, then single-quoted) with
# equal values -- presumably the two sides of a quote-style lint change.
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
"cloze_cn": [
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
......@@ -34,7 +34,7 @@ cloze_prompts = {
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
......@@ -80,10 +80,10 @@ for _split in list(mathbench_sets.keys()):
template=dict(
round=[
dict(
role="HUMAN",
prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
role='HUMAN',
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
......@@ -96,14 +96,14 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
path=f'./data/mathbench/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -7,14 +7,14 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
# Prompt templates for single-choice questions, keyed by template name
# (`single_choice_cn` / `single_choice_en`). The `_with_reasoning` variants are
# the ones looked up as `_name + '_with_reasoning'` when `with_reasoning` is set;
# the selected template becomes the HUMAN turn of the prompt round.
# NOTE(review): `{question}` is presumably filled from the `question` input
# column -- confirm against the reader config.
# NOTE(review): each key appears twice (double-quoted, then single-quoted) with
# equal values -- presumably the two sides of a quote-style lint change.
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
"cloze_cn": [
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
......@@ -34,7 +34,7 @@ cloze_prompts = {
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
......@@ -80,10 +80,10 @@ for _split in list(mathbench_sets.keys()):
template=dict(
round=[
dict(
role="HUMAN",
prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
role='HUMAN',
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
......@@ -96,14 +96,14 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
path=f'./data/mathbench/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
......
......@@ -25,13 +25,13 @@ for name in medbench_single_choice_sets + medbench_multiple_choices_sets:
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[dict(role="HUMAN", prompt='{problem_input}')])),
round=[dict(role='HUMAN', prompt='{problem_input}')])),
retriever=dict(type=ZeroRetriever
), # retriver 不起作用,以输入参数为准 (zero-shot / few-shot)
inferencer=dict(type=GenInferencer))
medbench_eval_cfg = dict(
evaluator=dict(type=MedBenchEvaluator), pred_role="BOT")
evaluator=dict(type=MedBenchEvaluator), pred_role='BOT')
medbench_datasets.append(
dict(
......@@ -49,13 +49,13 @@ for name in medbench_qa_sets:
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[dict(role="HUMAN", prompt='{problem_input}')])),
round=[dict(role='HUMAN', prompt='{problem_input}')])),
retriever=dict(type=ZeroRetriever
), # retriver 不起作用,以输入参数为准 (zero-shot / few-shot)
inferencer=dict(type=GenInferencer))
medbench_eval_cfg = dict(
evaluator=dict(type=MedBenchEvaluator_NLG), pred_role="BOT")
evaluator=dict(type=MedBenchEvaluator_NLG), pred_role='BOT')
medbench_datasets.append(
dict(
......@@ -73,13 +73,13 @@ for name in medbench_cloze_sets:
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[dict(role="HUMAN", prompt='{problem_input}')])),
round=[dict(role='HUMAN', prompt='{problem_input}')])),
retriever=dict(type=ZeroRetriever
), # retriver 不起作用,以输入参数为准 (zero-shot / few-shot)
inferencer=dict(type=GenInferencer))
medbench_eval_cfg = dict(
evaluator=dict(type=MedBenchEvaluator_Cloze), pred_role="BOT")
evaluator=dict(type=MedBenchEvaluator_Cloze), pred_role='BOT')
medbench_datasets.append(
dict(
......@@ -97,13 +97,13 @@ for name in medbench_ie_sets:
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[dict(role="HUMAN", prompt='{problem_input}')])),
round=[dict(role='HUMAN', prompt='{problem_input}')])),
retriever=dict(type=ZeroRetriever
), # retriver 不起作用,以输入参数为准 (zero-shot / few-shot)
inferencer=dict(type=GenInferencer))
medbench_eval_cfg = dict(
evaluator=dict(type=eval('MedBenchEvaluator_'+name.replace('-', '_'))), pred_role="BOT")
evaluator=dict(type=eval('MedBenchEvaluator_'+name.replace('-', '_'))), pred_role='BOT')
medbench_datasets.append(
dict(
......@@ -116,4 +116,4 @@ for name in medbench_ie_sets:
infer_cfg=medbench_infer_cfg.copy(),
eval_cfg=medbench_eval_cfg.copy()))
del name, medbench_infer_cfg, medbench_eval_cfg
\ No newline at end of file
del name, medbench_infer_cfg, medbench_eval_cfg
......@@ -14,44 +14,44 @@ from opencompass.datasets.NPHardEval import (
)
NPHardEval_tasks = [
["hard_GCP", "GCP", hard_GCP_Dataset, hard_GCP_Evaluator],
["hard_TSP", "TSP", hard_TSP_Dataset, hard_TSP_Evaluator],
["hard_MSP", "MSP", hard_MSP_Dataset, hard_MSP_Evaluator],
["cmp_GCP_D", "GCP_Decision", cmp_GCP_D_Dataset, cmp_GCP_D_Evaluator],
["cmp_TSP_D", "TSP_Decision", cmp_TSP_D_Dataset, cmp_TSP_D_Evaluator],
["cmp_KSP", "KSP", cmp_KSP_Dataset, cmp_KSP_Evaluator],
["p_BSP", "BSP", p_BSP_Dataset, p_BSP_Evaluator],
["p_EDP", "EDP", p_EDP_Dataset, p_EDP_Evaluator],
["p_SPP", "SPP", p_SPP_Dataset, p_SPP_Evaluator],
['hard_GCP', 'GCP', hard_GCP_Dataset, hard_GCP_Evaluator],
['hard_TSP', 'TSP', hard_TSP_Dataset, hard_TSP_Evaluator],
['hard_MSP', 'MSP', hard_MSP_Dataset, hard_MSP_Evaluator],
['cmp_GCP_D', 'GCP_Decision', cmp_GCP_D_Dataset, cmp_GCP_D_Evaluator],
['cmp_TSP_D', 'TSP_Decision', cmp_TSP_D_Dataset, cmp_TSP_D_Evaluator],
['cmp_KSP', 'KSP', cmp_KSP_Dataset, cmp_KSP_Evaluator],
['p_BSP', 'BSP', p_BSP_Dataset, p_BSP_Evaluator],
['p_EDP', 'EDP', p_EDP_Dataset, p_EDP_Evaluator],
['p_SPP', 'SPP', p_SPP_Dataset, p_SPP_Evaluator],
]
NPHardEval_datasets = []
for name, path_name, dataset, evaluator in NPHardEval_tasks:
NPHardEval_reader_cfg = dict(input_columns=["prompt", "level"], output_column="q")
NPHardEval_reader_cfg = dict(input_columns=['prompt', 'level'], output_column='q')
NPHardEval_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
begin='</E>',
round=[
dict(role="HUMAN", prompt="</E>{prompt}"),
dict(role="BOT", prompt=""),
dict(role='HUMAN', prompt='</E>{prompt}'),
dict(role='BOT', prompt=''),
],
),
ice_token="</E>",
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
NPHardEval_eval_cfg = dict(evaluator=dict(type=evaluator), pred_role="BOT")
NPHardEval_eval_cfg = dict(evaluator=dict(type=evaluator), pred_role='BOT')
NPHardEval_datasets.append(
dict(
type=dataset,
abbr=name,
path=f"./data/NPHardEval/{path_name}/",
path=f'./data/NPHardEval/{path_name}/',
reader_cfg=NPHardEval_reader_cfg,
infer_cfg=NPHardEval_infer_cfg,
eval_cfg=NPHardEval_eval_cfg,
......
......@@ -31,7 +31,7 @@ A. 2023年11月10日文一科技最新价34.01
B. 2023年11月10日文一科技成交额为2472820896
C. 文一科技的流通市值和总市值可能有误,因为流通市值5388200000元大于总市值5388204300元
D. 无明显错误数据
答案:
答案:
```
Output example I (from QWen-14B-Chat):
```
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment