"vscode:/vscode.git/clone" did not exist on "152aab304d4eabe4f992def6296156642e11c28d"
Commit be3dfa50 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2876 failed with stages
in 0 seconds
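# CMMLU 5-shot PPL config (string-style prompt templates). Each answer option
# gets its own fully formatted prompt; PPLInferencer scores all four candidates
# and selects the one with the lowest perplexity. Per-sample details are kept
# via AccwithDetailsEvaluator.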
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
question_and_options = '题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
cmmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={answer: f'{question_and_options}\n答案是: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={answer: f'{hint}\n</E>{question_and_options}\n答案是: {answer}' for answer in ['A', 'B', 'C', 'D']},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
cmmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name
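# CMMLU 5-shot PPL config (dialogue-style prompt templates with HUMAN/BOT
# roles), scored with the plain AccEvaluator.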
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
cmmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
answer: dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
),
dict(role='BOT', prompt=f'答案是: {answer}'),
])
for answer in ['A', 'B', 'C', 'D']
},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name
"""
Setting: 0-shot No-CoT
Evaluator: GenericLLMEvaluator
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'anatomy': '解剖学',
'astronomy': '天文学',
'college_actuarial_science': '大学精算学',
'college_engineering_hydrology': '大学工程水文学',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'computer_science': '计算机科学',
'conceptual_physics': '概念物理学',
'electrical_engineering': '电气工程',
'elementary_mathematics': '初等数学',
'genetics': '遗传学',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'machine_learning': '机器学习',
'virology': '病毒学',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
judge_cfg=dict(),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name
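# CMMLU 0-shot generation config judged by LMEvaluator instead of
# GenericLLMEvaluator; otherwise it mirrors the config above.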
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'anatomy': '解剖学',
'astronomy': '天文学',
'college_actuarial_science': '大学精算学',
'college_engineering_hydrology': '大学工程水文学',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'computer_science': '计算机科学',
'conceptual_physics': '概念物理学',
'electrical_engineering': '电气工程',
'elementary_mathematics': '初等数学',
'genetics': '遗传学',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'machine_learning': '机器学习',
'virology': '病毒学',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name
"""
Setting: 0-shot No-CoT
Evaluator: GenericLLMEvaluator
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.utils import xml_tag_postprocessor
cmmlu_subject_mapping = {
'anatomy': '解剖学',
'astronomy': '天文学',
'college_actuarial_science': '大学精算学',
'college_engineering_hydrology': '大学工程水文学',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'computer_science': '计算机科学',
'conceptual_physics': '概念物理学',
'electrical_engineering': '电气工程',
'elementary_mathematics': '初等数学',
'genetics': '遗传学',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'machine_learning': '机器学习',
'virology': '病毒学',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
judge_cfg=dict(),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name
### Description
A math dataset composed of problems from the CMO (Chinese Mathematical Olympiad), 2009-2022.
### Performance
| Qwen2.5-Math-72B-Instruct | Qwen2.5-Math-7B-Instruct | Qwen2-Math-7B-Instruct | Qwen2-Math-1.5B-Instruct | internlm2-math-7b |
| ----------- | ----------- | ----------- | ----------- | ----------- |
| 46.15 | 42.79 | 31.73 | 23.56 | 3.37 |

| Qwen2.5-72B-Instruct | Qwen2.5-7B-Instruct | internlm2_5-7b-chat |
| ----------- | ----------- | ----------- |
| 20.00 | 16.67 | 6.67 |
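### Usage
A minimal usage sketch; the config module name below matches the `cmo_fib_gen_ace24b` file referenced later in this commit, so adjust the import path to wherever the config lives in your setup:

```python
from mmengine.config import read_base

with read_base():
    # Pull in the CMO-FIB dataset definitions.
    from .cmo_fib_gen_ace24b import cmo_fib_datasets  # noqa: F401

datasets = [*cmo_fib_datasets]
```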
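# CMO-FIB zero-shot generation config (direct-answer prompt): the model is
# asked to put its final answer in \boxed{}; predictions are scored with
# MATHEvaluator (v2) after math_postprocess_v2.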
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
cmo_fib_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
cmo_fib_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\n你需要将最终答案写入\\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
cmo_fib_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)
cmo_fib_datasets = [
dict(
abbr='cmo_fib',
type=CMOFibDataset,
path='opencompass/cmo_fib',
reader_cfg=cmo_fib_reader_cfg,
infer_cfg=cmo_fib_infer_cfg,
eval_cfg=cmo_fib_eval_cfg
)
]
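# Aggregate config that re-exports the CMO-FIB datasets via read_base().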
from mmengine.config import read_base
with read_base():
from .cmo_fib_gen_ace24b import cmo_fib_datasets # noqa: F401, F403
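# CMO-FIB zero-shot generation config, chain-of-thought variant: the prompt
# asks for step-by-step reasoning before the final \boxed{} answer.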
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
cmo_fib_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
cmo_fib_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
cmo_fib_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)
cmo_fib_datasets = [
dict(
abbr='cmo_fib',
type=CMOFibDataset,
path='opencompass/cmo_fib',
reader_cfg=cmo_fib_reader_cfg,
infer_cfg=cmo_fib_infer_cfg,
eval_cfg=cmo_fib_eval_cfg
)
]
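# Dataset collection configs. Each one imports a set of per-dataset configs
# via read_base() and then flattens every imported *_datasets list into a
# single `datasets` list with sum(..., []).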
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
from ..ceval.ceval_ppl_1cd8bf import ceval_datasets
from ..GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import GaokaoBench_datasets
from ..triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets
from ..nq.nq_open_1shot_gen_20a989 import nq_datasets
from ..race.race_ppl_abed12 import race_datasets
from ..winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
from ..bbh.bbh_gen_98fba6 import bbh_datasets
from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets
from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
from ..humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets
from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..agieval.agieval_mixed_713d14 import agieval_datasets
from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets
from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets
from ..FewCLUE_tnews.FewCLUE_tnews_ppl_d10e8a import tnews_datasets
from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ..race.race_ppl_a138cd import race_datasets
from ..Xsum.Xsum_gen_31397e import Xsum_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..math.math_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets
from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
from ..ARC_e.ARC_e_ppl_a450bd import ARC_e_datasets
from ..ARC_c.ARC_c_ppl_a450bd import ARC_c_datasets
from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
from ..siqa.siqa_ppl_ced5f6 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_c7c154 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ..flores.flores_gen_806ede import flores_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..agieval.agieval_mixed_713d14 import agieval_datasets
from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets
from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets
from ..FewCLUE_tnews.FewCLUE_tnews_ppl_d10e8a import tnews_datasets
from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ..race.race_ppl_5831a0 import race_datasets
from ..Xsum.Xsum_gen_31397e import Xsum_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..math.math_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets
from ..hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
from ..ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
from ..ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ..piqa.piqa_ppl_0cfff2 import piqa_datasets
from ..siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_6aac9e import obqa_datasets
from ..nq.nq_gen_0356ec import nq_datasets
from ..triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
from ..flores.flores_gen_806ede import flores_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_d0f531 import WSC_datasets
from ..race.race_ppl_a138cd import race_datasets
from ..math.math_gen_265cce import math_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_c7c154 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ..ceval.ceval_internal_gen_2daf24 import ceval_datasets
from ..GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from ..triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from ..nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..winogrande.winogrande_5shot_gen_6447e6 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..ceval.ceval_gen_5f30c7 import ceval_datasets
from ..agieval.agieval_gen_64afd3 import agieval_datasets
from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ..CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
from ..CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
from ..FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
from ..FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..Xsum.Xsum_gen_31397e import Xsum_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_gen_315438 import summedits_datasets
from ..math.math_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_gen_7009de import TheoremQA_datasets
from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ..ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
from ..commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..siqa.siqa_gen_e78df3 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ..flores.flores_gen_806ede import flores_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..ceval.ceval_gen_5f30c7 import ceval_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_fe4bf3 import WSC_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..math.math_gen_265cce import math_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_gen_315438 import summedits_datasets
from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ...ceval.ceval_ppl_578f8d import ceval_datasets
from ...agieval.agieval_mixed_713d14 import agieval_datasets
from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
from ...GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
from ...flores.flores_gen_806ede import flores_datasets
from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
from ...nq.nq_gen_0356ec import nq_datasets
from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ...race.race_ppl_5831a0 import race_datasets
from ...obqa.obqa_gen_9069e4 import obqa_datasets
from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ...Xsum.Xsum_gen_31397e import Xsum_datasets
from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ...lambada.lambada_gen_217e11 import lambada_datasets
from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ...piqa.piqa_gen_1194eb import piqa_datasets
from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ...math.math_gen_265cce import math_datasets
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ...drop.deprecated_drop_gen_8a9ed9 import drop_datasets
from ...humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ...bbh.bbh_gen_5bf00b import bbh_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ...ceval.ceval_gen_5f30c7 import ceval_datasets
from ...agieval.agieval_mixed_713d14 import agieval_datasets
from ...mmlu.mmlu_gen_4d595a import mmlu_datasets
from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
from ...flores.flores_gen_806ede import flores_datasets
from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ...nq.nq_gen_c788f6 import nq_datasets
from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ...race.race_gen_69ee4f import race_datasets
from ...obqa.obqa_ppl_6aac9e import obqa_datasets
from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ...Xsum.Xsum_gen_31397e import Xsum_datasets
from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ...lambada.lambada_gen_217e11 import lambada_datasets
from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ...math.math_gen_265cce import math_datasets
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ...drop.deprecated_drop_gen_8a9ed9 import drop_datasets
from ...humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ...bbh.bbh_gen_5b92b0 import bbh_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
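# CommonsenseQA generation config with 7 chain-of-thought exemplars; answers
# are extracted with the "so the answer is X" pattern and scored by AccEvaluator.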
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
from opencompass.utils.text_postprocessors import (
match_answer_pattern,
)
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation',
)
_ice_template = dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: A.shirt pocket B.calligrapher’s hand C.inkwell D.desk drawer E.blotter',
),
dict(
role='BOT',
prompt='A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink. So the answer is E.',
),
dict(
role='HUMAN',
prompt='Q: What home entertainment equipment requires cable? Answer Choices: A.radio shack B.substation C.television D.cabinet',
),
dict(
role='BOT',
prompt='A: The answer must require cable. Of the above choices, only television requires cable. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: A.pretty flowers B.hen house C.natural habitat D.storybook',
),
dict(
role='BOT',
prompt='A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: A.populated areas B.race track C.desert D.apartment E.roadblock',
),
dict(
role='BOT',
prompt='A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of people. So the answer is A.',
),
dict(
role='HUMAN',
prompt='Q: Where do you put your grapes just before checking out? Answer Choices: A.mouth B.grocery cart C.super market D.fruit basket E.fruit market',
),
dict(
role='BOT',
prompt='A: The answer should be the place where grocery items are placed before checking out. Of the above choices, grocery cart makes the most sense for holding grocery items. So the answer is B.',
),
dict(
role='HUMAN',
prompt='Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: A.united states B.mexico C.countryside D.atlas',
),
dict(
role='BOT',
prompt='A: The answer must be something that used to do what Google Maps and GPS services do, which is to give directions. Of the above choices, only atlases are used to give directions. So the answer is D.',
),
dict(
role='HUMAN',
prompt='Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: A.harder B.anguish C.bitterness D.tears E.sadness',
),
dict(
role='BOT',
prompt='A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above choices, the closest feeling is bitterness. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q:{question} Answer Choices: A. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nA:',
),
dict(
role='BOT',
prompt='{answerKey}',
),
],
),
ice_token='</E>',
)
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
commonsenseqa_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern, answer_pattern=r'(?i)so the answer is\s*([A-P])'
),
)
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg,
)
]
del _ice_template