from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import CEvalDataset
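# C-Eval (internal copy), 5-shot perplexity-based evaluation. For every subject
# and every split in ['val', 'test'], the loop below builds a config in which
# FixKRetriever injects 5 fixed dev-set examples as in-context demonstrations,
# PPLInferencer scores one prompt per candidate option (A-D) and picks the
# lowest-perplexity one, and AccwithDetailsEvaluator reports accuracy together
# with per-sample details.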
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template={
                    answer: dict(
                        begin='</E>',
                        round=[
                            dict(
                                role='HUMAN',
                                prompt=f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
                            ),
                            dict(role='BOT', prompt=answer),
                        ])
                    for answer in ['A', 'B', 'C', 'D']
                },
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=PPLInferencer),
        )

        ceval_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))

        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='./data/ceval_internal/formal_ceval',
                local_mode=True,
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))

del _split, _name, _ch_name
from mmengine.config import read_base
with read_base():
    from .ceval_ppl_578f8d import ceval_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
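# C-Eval, 5-shot PPL variant built from plain string templates: the in-context
# examples and the scored candidate prompts are assembled with f-strings rather
# than chat-style round=[...] messages, and accuracy is computed by AccEvaluator
# over both the 'val' and 'test' splits.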
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
    for _name in ceval_all_sets:
        ceval_reader_cfg = dict(
            input_columns=['question', 'A', 'B', 'C', 'D'],
            output_column='answer',
            train_split='dev',
            test_split=_split,
        )

        _ch_name = ceval_subject_mapping[_name][1]

        hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
        question_and_options = '{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template={answer: f'{question_and_options}\n答案: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template={answer: f'{hint}\n</E>{question_and_options}\n答案: {answer}' for answer in ['A', 'B', 'C', 'D']},
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=PPLInferencer),
        )

        ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=ceval_reader_cfg,
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
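# C-Eval, 5-shot PPL evaluation on the validation split only, using chat-style
# HUMAN/BOT templates; PPLInferencer scores one prompt per option A-D and the
# lowest-perplexity option is taken as the prediction.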
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template={
                    answer: dict(
                        begin='</E>',
                        round=[
                            dict(
                                role='HUMAN',
                                prompt=f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
                            ),
                            dict(role='BOT', prompt=answer),
                        ])
                    for answer in ['A', 'B', 'C', 'D']
                },
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=PPLInferencer),
        )

        ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))

del _split, _name, _ch_name
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
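# Same 5-shot PPL setup as the previous config, but dataset entries are
# generated for both the 'val' and 'test' splits.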
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template={
                    answer: dict(
                        begin='</E>',
                        round=[
                            dict(
                                role='HUMAN',
                                prompt=f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
                            ),
                            dict(role='BOT', prompt=answer),
                        ])
                    for answer in ['A', 'B', 'C', 'D']
                },
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=PPLInferencer),
        )

        ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))

del _split, _name, _ch_name
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever, ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
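# C-Eval, zero-shot chain-of-thought generation: ZeroRetriever supplies no
# in-context examples, the prompt ends with '让我们一步一步思考。答案: ' to elicit
# step-by-step reasoning, and first_option_postprocess extracts the first
# A/B/C/D letter from the generated text before accuracy is computed.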
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin='</E>',
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n让我们一步一步思考。答案: '
                        ),
                        dict(role='BOT', prompt='{answer}'),
                    ]),
                ice_token='</E>',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer),
        )

        ceval_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))
# Overview
<p align="center">
🌐 <a href="https://openstellarteam.github.io/ChineseSimpleQA/" target="_blank">Website</a> • 🤗 <a href="https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA" target="_blank">Hugging Face</a> • ⏬ <a href="#data" target="_blank">Data</a> • 📃 <a href="https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA" target="_blank">Paper</a> • 📊 <a href="http://47.109.32.164/" target="_blank">Leaderboard</a> <br> <a href="https://github.com/OpenStellarTeam/ChineseSimpleQA/blob/master/README_zh.md"> 中文</a> | <a href="https://github.com/OpenStellarTeam/ChineseSimpleQA/blob/master/README.md">English</a>
</p>
**Chinese SimpleQA** is the first comprehensive Chinese benchmark for evaluating the factuality of language models when answering short questions. It has five main properties (Chinese, diverse, high-quality, static, and easy to evaluate), and it covers **6 major topics** with **99 diverse subtopics**.
Please visit our [website](https://openstellarteam.github.io/ChineseSimpleQA/) or check our [paper](https://arxiv.org/abs/2411.07140) for more details.
## 💫 Introduction
* How to address generative hallucination has long been an unsolved problem in the field of artificial intelligence (AI). To measure the factual correctness of language models, OpenAI recently released and open-sourced a test set called SimpleQA. We have likewise been following the field of factuality evaluation, which currently suffers from outdated data, inaccurate evaluation, and incomplete coverage; for example, the knowledge evaluation sets still in wide use, such as CommonSenseQA, CMMLU, and C-Eval, are multiple-choice benchmarks. **To further promote research by the Chinese community on the factual correctness of models, we propose Chinese SimpleQA**, which consists of 3000 high-quality questions spanning 6 major topics, ranging from the humanities to science and engineering. Specifically, the distinct features of our proposed Chinese SimpleQA dataset are as follows:
* 🀄**Chinese:** Our Chinese SimpleQA focuses on the Chinese language, which provides a comprehensive evaluation of the factuality abilities of existing LLMs in Chinese.
* 🍀**Diverse:** Chinese SimpleQA covers 6 topics (i.e., “Chinese Culture”, “Humanities”, “Engineering, Technology, and Applied Sciences”, “Life, Art, and Culture”, “Society”, and “Natural Science”), and these topics include 99 fine-grained subtopics in total, which demonstrates the diversity of Chinese SimpleQA.
* **High-quality:** We conduct a comprehensive and rigorous quality-control process to ensure the quality and accuracy of Chinese SimpleQA.
* 💡**Static:** Following SimpleQA, to preserve the evergreen property of Chinese SimpleQA, all reference answers would not change over time.
* 🗂️**Easy-to-evaluate:** Following SimpleQA, as the questions and answers are very short, the grading procedure is fast to run via existing LLMs (e.g., OpenAI API).
- Based on Chinese SimpleQA, we have conducted a comprehensive evaluation of the factual capabilities of existing LLMs. We also maintain a comprehensive leaderboard list.
- In short, we hope that Chinese SimpleQA can help developers gain a deeper understanding of the factual correctness of their models in the Chinese domain, provide an important cornerstone for their algorithm research, and jointly promote the growth of Chinese foundation models.
## 📊 Leaderboard
See the [leaderboard](http://47.109.32.164/) for details.
## ⚖️ Evals
We provide three evaluation methods.
(1) The first method is based on the simple-evals framework. The startup command is as follows:
```bash
python -m simple-evals.demo
```
This will launch evaluations through the OpenAI API.
(2) The second is a simple standalone evaluation script that we wrote from scratch. The startup command is as follows:
- Step 1: Set your OpenAI API key in `scripts/chinese_simpleqa_easy.py`:
```
os.environ["OPENAI_API_KEY"] = "replace your key here"
```
- Step 2: Run the evaluation script:
```
python scripts/chinese_simpleqa_easy.py
```
- Step 3: We also provide a unified post-processing script for multiple models' results; running it produces a complete leaderboard:
```
python scripts/get_leaderboard.py
```
(3) We have also integrated the Chinese SimpleQA benchmark into our forked [OpenCompass](https://github.com/open-compass/opencompass). You can refer to the OpenCompass configuration script for evaluation.
- Step 1: Clone OpenCompass:
```shell
cd ~
git clone git@github.com:open-compass/opencompass.git
cd opencompass
```
- Step 2: Download the Chinese SimpleQA data from [Hugging Face](https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA) and put it under `OPENCOMPASS_PATH/data/chinese_simpleqa`, so that the directory looks like this:
```
~/opencompass/data/
└── chinese_simpleqa
    ├── chinese_simpleqa.jsonl
```
- Step 3: Configure your run in `configs/eval_chinese_simpleqa.py`: set the models to be evaluated and the judge model (we recommend GPT-4o), then launch it (a minimal sketch of such a config is shown after the command):
```
python run.py configs/eval_chinese_simpleqa.py
```
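
The sketch below illustrates what `configs/eval_chinese_simpleqa.py` can look like. It is only an illustrative sketch following common OpenCompass conventions: the dataset import path, the model entries, and the judge wiring are placeholder assumptions, so use the config shipped in the forked repository as the source of truth.

```python
from mmengine.config import read_base
from opencompass.models import OpenAI

with read_base():
    # dataset config providing `csimpleqa_datasets` (import path is a placeholder)
    from .datasets.chinese_simpleqa.chinese_simpleqa import csimpleqa_datasets

datasets = [*csimpleqa_datasets]

# Models to be evaluated -- replace with your own model configs.
models = [
    dict(type=OpenAI, abbr='gpt-4o-mini', path='gpt-4o-mini',
         key='ENV',  # 'ENV' reads the key from the OPENAI_API_KEY environment variable
         max_out_len=2048, batch_size=8),
]

# Judge model used by LMEvaluator (GPT-4o is recommended); how the judge is
# attached to the eval stage follows the template in the forked repo.
judge_models = [
    dict(type=OpenAI, abbr='gpt-4o', path='gpt-4o',
         key='ENV', max_out_len=2048, batch_size=8),
]
```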
## Citation
Please cite our paper if you use our dataset.
```
@misc{he2024chinesesimpleqachinesefactuality,
title={Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language Models},
author={Yancheng He and Shilong Li and Jiaheng Liu and Yingshui Tan and Weixun Wang and Hui Huang and Xingyuan Bu and Hangyu Guo and Chengwei Hu and Boren Zheng and Zhuoran Lin and Xuepeng Liu and Dekai Sun and Shirong Lin and Zhicheng Zheng and Xiaoyong Zhu and Wenbo Su and Bo Zheng},
year={2024},
eprint={2411.07140},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.07140},
}
```
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CsimpleqaDataset, csimpleqa_postprocess
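# Chinese SimpleQA: the candidate model answers each question with
# GenInferencer; LMEvaluator then sends the question, gold answer and
# prediction to a judge model using the dataset-provided system_prompt and
# prompt_template columns, and csimpleqa_postprocess converts the judge
# outputs into scores.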
subjective_reader_cfg = dict(input_columns=['primary_category', 'question','gold_ans', 'messages', 'system_prompt','prompt_template'], output_column='judge')
subjective_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{question}'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt='{system_prompt}')
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt='{prompt_template}'
                    ),
                ]
            ),
        ),
        dict_postprocessor=dict(type=csimpleqa_postprocess),
    ),
    pred_role='BOT',
)

csimpleqa_datasets = [
    dict(
        abbr='chinese_simpleqa',
        type=CsimpleqaDataset,
        name='chinese_simpleqa',
        path='opencompass/chinese_simpleqa',
        reader_cfg=subjective_reader_cfg,
        infer_cfg=subjective_infer_cfg,
        eval_cfg=subjective_eval_cfg,
        mode='singlescore',
    )
]
from mmengine.config import read_base
with read_base():
    from .civilcomments_clp_a3c5fd import civilcomments_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import CLPInferencer
from opencompass.openicl.icl_evaluator import AUCROCEvaluator
from opencompass.datasets import CivilCommentsDataset
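# CivilComments toxicity detection: CLPInferencer scores the conditional
# log-probability of the answer tokens that follow the prompt, and
# AUCROCEvaluator computes AUC-ROC against the toxicity label.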
civilcomments_reader_cfg = dict(
    input_columns=['text'],
    output_column='label',
    train_split='test',
    test_split='test')

civilcomments_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Text: {text}\nQuestion: '
        'Does the above text contain rude, hateful, aggressive, disrespectful '
        'or unreasonable language?\nAnswer:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=CLPInferencer))

civilcomments_eval_cfg = dict(evaluator=dict(type=AUCROCEvaluator), )

civilcomments_datasets = [
    dict(
        type=CivilCommentsDataset,
        path='civil_comments',
        reader_cfg=civilcomments_reader_cfg,
        infer_cfg=civilcomments_infer_cfg,
        eval_cfg=civilcomments_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import CLPInferencer
from opencompass.openicl.icl_evaluator import AUCROCEvaluator
from opencompass.datasets import CivilCommentsDataset
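# Same CivilComments CLP setup as above, with the prompt expressed as a
# chat-style round=[...] message for dialogue models.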
civilcomments_reader_cfg = dict(
    input_columns=['text'],
    output_column='label',
    train_split='test',
    test_split='test')

civilcomments_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Text: {text}\nQuestion: Does the above text contain '
                'rude, hateful, aggressive, disrespectful or unreasonable '
                'language?\nAnswer:')
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=CLPInferencer))

civilcomments_eval_cfg = dict(evaluator=dict(type=AUCROCEvaluator), )

civilcomments_datasets = [
    dict(
        type=CivilCommentsDataset,
        path='civil_comments',
        reader_cfg=civilcomments_reader_cfg,
        infer_cfg=civilcomments_infer_cfg,
        eval_cfg=civilcomments_eval_cfg)
]
from mmengine.config import read_base
with read_base():
    from .clozeTest_maxmin_gen_c205fb import maxmin_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MaxminDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
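# Cloze Test (max/min): the model is asked whether the masked token in a code
# snippet should be 'max' (A) or 'min' (B); first_capital_postprocess keeps the
# first capital letter of the generation before accuracy is computed.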
maxmin_reader_cfg = dict(
    input_columns=['nl_tokens', 'pl_tokens'],
    output_column='answer',
)

maxmin_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt="Code:{pl_tokens}\nThe aim of the code: {nl_tokens}\nQuestion: Please tell me what \"<mask>\" in the code should be replaced with and you must response to me only A or B.\nA. max\nB. min\nAnswer:"),
                dict(role='BOT', prompt='{answer}'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

maxmin_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
                       pred_role='BOT',
                       pred_postprocessor=dict(type=first_capital_postprocess))

maxmin_datasets = [
    dict(
        type=MaxminDataset,
        abbr=f'maxmin',
        test_path='opencompass/clozeTest_maxmin',
        answer_path='opencompass/clozeTest_maxmin_answers',
        reader_cfg=maxmin_reader_cfg,
        infer_cfg=maxmin_infer_cfg,
        eval_cfg=maxmin_eval_cfg,
    )
]
from mmengine.config import read_base
with read_base():
    from .cmb_gen_dfb5c4 import cmb_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMBDataset
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.utils.text_postprocessors import multiple_select_postprocess
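# CMB (Chinese medical exam benchmark): zero-shot generation over the val and
# test splits; multiple_select_postprocess extracts the selected option
# letters from the output so that multi-answer questions can be scored.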
cmb_datasets = []
for split in ['val', 'test']:
    cmb_reader_cfg = dict(
        input_columns=['exam_type', 'exam_class', 'question_type', 'question', 'option_str'],
        output_column='answer',
        train_split=split,
        test_split=split,
    )

    cmb_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'以下是中国{{exam_type}}中{{exam_class}}考试的一道{{question_type}},不需要做任何分析和解释,直接输出答案选项。\n{{question}}\n{{option_str}} \n 答案: ',
                    ),
                    dict(role='BOT', prompt='{answer}'),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=10),
    )

    cmb_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=multiple_select_postprocess),
    )

    cmb_datasets.append(
        dict(
            abbr='cmb' if split == 'val' else 'cmb_test',
            type=CMBDataset,
            path='./data/CMB/',
            reader_cfg=cmb_reader_cfg,
            infer_cfg=cmb_infer_cfg,
            eval_cfg=cmb_eval_cfg,
        )
    )
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
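# CMMLU, zero-shot chain-of-thought setting: QUERY_TEMPLATE below asks the model
# to think step by step and to end with a line of the form '答案: $选项';
# match_answer_pattern then extracts that option letter with the answer_pattern
# regex before accuracy is computed.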
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. 请在回答之前一步步思考.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
    _ch_name = cmmlu_subject_mapping[_name]
    prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
    cmmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=prompt_prefix + QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    cmmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(
            type=match_answer_pattern,
            # answer_pattern=r'(?i)答案\s*:\s*([A-D])'
            answer_pattern=r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*',
        )
    )

    cmmlu_datasets.append(
        dict(
            type=CMMLUDataset,
            path='opencompass/cmmlu',
            name=_name,
            abbr=f'cmmlu-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test'),
            infer_cfg=cmmlu_infer_cfg,
            eval_cfg=cmmlu_eval_cfg,
        ))

del _name, _ch_name
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
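# CMMLU with an LLM judge: the candidate model answers with GenInferencer, then
# LMEvaluator asks a judge model to grade each prediction against the gold
# option using GRADER_TEMPLATE (reply 'A' for correct, 'B' for incorrect), and
# generic_llmjudge_postprocess converts the judge replies into an accuracy
# score.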
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
    _ch_name = cmmlu_subject_mapping[_name]
    prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
    cmmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=prompt_prefix + QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    cmmlu_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    cmmlu_datasets.append(
        dict(
            type=CMMLUDataset,
            path='opencompass/cmmlu',
            name=_name,
            abbr=f'cmmlu-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test'),
            infer_cfg=cmmlu_infer_cfg,
            eval_cfg=cmmlu_eval_cfg,
            mode='singlescore',
        ))

del _name, _ch_name
from mmengine.config import read_base
with read_base():
    from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
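# CMMLU, 5-shot generative setting: FixKRetriever inserts five fixed dev-set
# examples as demonstrations, the model generates an answer, and
# first_capital_postprocess keeps the first capital letter for accuracy
# scoring with AccwithDetailsEvaluator.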
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
    _ch_name = cmmlu_subject_mapping[_name]
    cmmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
                    ),
                    dict(role='BOT', prompt='答案是: {answer}'),
                ]),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    cmmlu_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    cmmlu_datasets.append(
        dict(
            type=CMMLUDataset,
            path='opencompass/cmmlu',
            name=_name,
            abbr=f'cmmlu-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test'),
            infer_cfg=cmmlu_infer_cfg,
            eval_cfg=cmmlu_eval_cfg,
        ))

del _name, _ch_name
from mmengine.config import read_base
with read_base():
    from .cmmlu_ppl_8b9c76 import cmmlu_datasets # noqa: F401, F403