"vscode:/vscode.git/clone" did not exist on "152aab304d4eabe4f992def6296156642e11c28d"
Commit be3dfa50 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2876 failed with stages
in 0 seconds
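# CMMLU 5-shot PPL config (string-style prompt templates). Each answer option
# gets its own fully formatted prompt; PPLInferencer scores all four candidates
# and selects the one with the lowest perplexity. Per-sample details are kept
# via AccwithDetailsEvaluator.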
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
question_and_options = '题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
cmmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={answer: f'{question_and_options}\n答案是: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={answer: f'{hint}\n</E>{question_and_options}\n答案是: {answer}' for answer in ['A', 'B', 'C', 'D']},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
cmmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name
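# CMMLU 5-shot PPL config (dialogue-style prompt templates with HUMAN/BOT
# roles), scored with the plain AccEvaluator.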
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
cmmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
answer: dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
),
dict(role='BOT', prompt=f'答案是: {answer}'),
])
for answer in ['A', 'B', 'C', 'D']
},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name
"""
Setting: 0-shot No-CoT
Evaluator: GenericLLMEvaluator
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'anatomy': '解剖学',
'astronomy': '天文学',
'college_actuarial_science': '大学精算学',
'college_engineering_hydrology': '大学工程水文学',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'computer_science': '计算机科学',
'conceptual_physics': '概念物理学',
'electrical_engineering': '电气工程',
'elementary_mathematics': '初等数学',
'genetics': '遗传学',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'machine_learning': '机器学习',
'virology': '病毒学',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
judge_cfg=dict(),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name
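# CMMLU 0-shot generation config judged by LMEvaluator instead of
# GenericLLMEvaluator; otherwise it mirrors the config above.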
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'anatomy': '解剖学',
'astronomy': '天文学',
'college_actuarial_science': '大学精算学',
'college_engineering_hydrology': '大学工程水文学',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'computer_science': '计算机科学',
'conceptual_physics': '概念物理学',
'electrical_engineering': '电气工程',
'elementary_mathematics': '初等数学',
'genetics': '遗传学',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'machine_learning': '机器学习',
'virology': '病毒学',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name
"""
Setting: 0-shot No-CoT
Evaluator: GenericLLMEvaluator
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.utils import xml_tag_postprocessor
cmmlu_subject_mapping = {
'anatomy': '解剖学',
'astronomy': '天文学',
'college_actuarial_science': '大学精算学',
'college_engineering_hydrology': '大学工程水文学',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'computer_science': '计算机科学',
'conceptual_physics': '概念物理学',
'electrical_engineering': '电气工程',
'elementary_mathematics': '初等数学',
'genetics': '遗传学',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'machine_learning': '机器学习',
'virology': '病毒学',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
judge_cfg=dict(),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name
### Description
A math dataset composed of problems from the CMO (Chinese Mathematical Olympiad), 2009-2022.
### Performance
| Qwen2.5-Math-72B-Instruct | Qwen2.5-Math-7B-Instruct | Qwen2-Math-7B-Instruct | Qwen2-Math-1.5B-Instruct | internlm2-math-7b |
| ----------- | ----------- | ----------- | ----------- | ----------- |
| 46.15 | 42.79 | 31.73 | 23.56 | 3.37 |

| Qwen2.5-72B-Instruct | Qwen2.5-7B-Instruct | internlm2_5-7b-chat |
| ----------- | ----------- | ----------- |
| 20.00 | 16.67 | 6.67 |
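### Usage
A minimal usage sketch; the config module name below matches the `cmo_fib_gen_ace24b` file referenced later in this commit, so adjust the import path to wherever the config lives in your setup:

```python
from mmengine.config import read_base

with read_base():
    # Pull in the CMO-FIB dataset definitions.
    from .cmo_fib_gen_ace24b import cmo_fib_datasets  # noqa: F401

datasets = [*cmo_fib_datasets]
```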
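# CMO-FIB zero-shot generation config (direct-answer prompt): the model is
# asked to put its final answer in \boxed{}; predictions are scored with
# MATHEvaluator (v2) after math_postprocess_v2.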
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
cmo_fib_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
cmo_fib_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\n你需要将最终答案写入\\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
cmo_fib_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)
cmo_fib_datasets = [
dict(
abbr='cmo_fib',
type=CMOFibDataset,
path='opencompass/cmo_fib',
reader_cfg=cmo_fib_reader_cfg,
infer_cfg=cmo_fib_infer_cfg,
eval_cfg=cmo_fib_eval_cfg
)
]
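# Aggregate config that re-exports the CMO-FIB datasets via read_base().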
from mmengine.config import read_base
with read_base():
from .cmo_fib_gen_ace24b import cmo_fib_datasets # noqa: F401, F403
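# CMO-FIB zero-shot generation config, chain-of-thought variant: the prompt
# asks for step-by-step reasoning before the final \boxed{} answer.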
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
cmo_fib_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
cmo_fib_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
cmo_fib_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)
cmo_fib_datasets = [
dict(
abbr='cmo_fib',
type=CMOFibDataset,
path='opencompass/cmo_fib',
reader_cfg=cmo_fib_reader_cfg,
infer_cfg=cmo_fib_infer_cfg,
eval_cfg=cmo_fib_eval_cfg
)
]
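# Dataset collection configs. Each one imports a set of per-dataset configs
# via read_base() and then flattens every imported *_datasets list into a
# single `datasets` list with sum(..., []).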
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
from ..ceval.ceval_ppl_1cd8bf import ceval_datasets
from ..GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import GaokaoBench_datasets
from ..triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets
from ..nq.nq_open_1shot_gen_20a989 import nq_datasets
from ..race.race_ppl_abed12 import race_datasets
from ..winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
from ..bbh.bbh_gen_98fba6 import bbh_datasets
from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets
from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
from ..humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets
from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..agieval.agieval_mixed_713d14 import agieval_datasets
from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets
from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets
from ..FewCLUE_tnews.FewCLUE_tnews_ppl_d10e8a import tnews_datasets
from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ..race.race_ppl_a138cd import race_datasets
from ..Xsum.Xsum_gen_31397e import Xsum_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..math.math_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets
from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
from ..ARC_e.ARC_e_ppl_a450bd import ARC_e_datasets
from ..ARC_c.ARC_c_ppl_a450bd import ARC_c_datasets
from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
from ..siqa.siqa_ppl_ced5f6 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_c7c154 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ..flores.flores_gen_806ede import flores_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..agieval.agieval_mixed_713d14 import agieval_datasets
from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets
from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets
from ..FewCLUE_tnews.FewCLUE_tnews_ppl_d10e8a import tnews_datasets
from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ..race.race_ppl_5831a0 import race_datasets
from ..Xsum.Xsum_gen_31397e import Xsum_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..math.math_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets
from ..hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
from ..ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
from ..ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ..piqa.piqa_ppl_0cfff2 import piqa_datasets
from ..siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_6aac9e import obqa_datasets
from ..nq.nq_gen_0356ec import nq_datasets
from ..triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
from ..flores.flores_gen_806ede import flores_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_d0f531 import WSC_datasets
from ..race.race_ppl_a138cd import race_datasets
from ..math.math_gen_265cce import math_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_c7c154 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ..ceval.ceval_internal_gen_2daf24 import ceval_datasets
from ..GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from ..triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from ..nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..winogrande.winogrande_5shot_gen_6447e6 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..ceval.ceval_gen_5f30c7 import ceval_datasets
from ..agieval.agieval_gen_64afd3 import agieval_datasets
from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ..CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
from ..CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
from ..FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
from ..FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..Xsum.Xsum_gen_31397e import Xsum_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_gen_315438 import summedits_datasets
from ..math.math_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_gen_7009de import TheoremQA_datasets
from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ..ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
from ..commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..siqa.siqa_gen_e78df3 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ..flores.flores_gen_806ede import flores_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..ceval.ceval_gen_5f30c7 import ceval_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ..lambada.lambada_gen_217e11 import lambada_datasets
from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_fe4bf3 import WSC_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..math.math_gen_265cce import math_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..summedits.summedits_gen_315438 import summedits_datasets
from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ...ceval.ceval_ppl_578f8d import ceval_datasets
from ...agieval.agieval_mixed_713d14 import agieval_datasets
from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
from ...GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
from ...flores.flores_gen_806ede import flores_datasets
from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
from ...nq.nq_gen_0356ec import nq_datasets
from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ...race.race_ppl_5831a0 import race_datasets
from ...obqa.obqa_gen_9069e4 import obqa_datasets
from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ...Xsum.Xsum_gen_31397e import Xsum_datasets
from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ...lambada.lambada_gen_217e11 import lambada_datasets
from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ...piqa.piqa_gen_1194eb import piqa_datasets
from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ...math.math_gen_265cce import math_datasets
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ...drop.deprecated_drop_gen_8a9ed9 import drop_datasets
from ...humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ...bbh.bbh_gen_5bf00b import bbh_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base
with read_base():
from ...ceval.ceval_gen_5f30c7 import ceval_datasets
from ...agieval.agieval_mixed_713d14 import agieval_datasets
from ...mmlu.mmlu_gen_4d595a import mmlu_datasets
from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
from ...flores.flores_gen_806ede import flores_datasets
from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ...nq.nq_gen_c788f6 import nq_datasets
from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ...race.race_gen_69ee4f import race_datasets
from ...obqa.obqa_ppl_6aac9e import obqa_datasets
from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ...Xsum.Xsum_gen_31397e import Xsum_datasets
from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ...lambada.lambada_gen_217e11 import lambada_datasets
from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ...math.math_gen_265cce import math_datasets
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ...drop.deprecated_drop_gen_8a9ed9 import drop_datasets
from ...humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
from ...bbh.bbh_gen_5b92b0 import bbh_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
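# CommonsenseQA generation config with 7 chain-of-thought exemplars; answers
# are extracted with the "so the answer is X" pattern and scored by AccEvaluator.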
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
from opencompass.utils.text_postprocessors import (
match_answer_pattern,
)
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation',
)
_ice_template = dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: A.shirt pocket B.calligrapher’s hand C.inkwell D.desk drawer E.blotter',
),
dict(
role='BOT',
prompt='A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink. So the answer is E.',
),
dict(
role='HUMAN',
prompt='Q: What home entertainment equipment requires cable? Answer Choices: A.radio shack B.substation C.television D.cabinet',
),
dict(
role='BOT',
prompt='A: The answer must require cable. Of the above choices, only television requires cable. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: A.pretty flowers B.hen house C.natural habitat D.storybook',
),
dict(
role='BOT',
prompt='A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: A.populated areas B.race track C.desert D.apartment E.roadblock',
),
dict(
role='BOT',
prompt='A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of people. So the answer is A.',
),
dict(
role='HUMAN',
prompt='Q: Where do you put your grapes just before checking out? Answer Choices: A.mouth B.grocery cart C.super market D.fruit basket E.fruit market',
),
dict(
role='BOT',
prompt='A: The answer should be the place where grocery items are placed before checking out. Of the above choices, grocery cart makes the most sense for holding grocery items. So the answer is B.',
),
dict(
role='HUMAN',
prompt='Q: Google Maps and other highway and street GPS services have replaced what? Answer Choices: A.united states B.mexico C.countryside D.atlas',
),
dict(
role='BOT',
prompt='A: The answer must be something that used to do what Google Maps and GPS services do, which is to give directions. Of the above choices, only atlases are used to give directions. So the answer is D.',
),
dict(
role='HUMAN',
prompt='Q: Before getting a divorce, what did the wife feel who was doing all the work? Answer Choices: A.harder B.anguish C.bitterness D.tears E.sadness',
),
dict(
role='BOT',
prompt='A: The answer should be the feeling of someone getting divorced who was doing all the work. Of the above choices, the closest feeling is bitterness. So the answer is C.',
),
dict(
role='HUMAN',
prompt='Q:{question} Answer Choices: A. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nA:',
),
dict(
role='BOT',
prompt='{answerKey}',
),
],
),
ice_token='</E>',
)
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
commonsenseqa_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(
type=match_answer_pattern, answer_pattern=r'(?i)so the answer is\s*([A-P])'
),
)
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg,
)
]
del _ice_template