"tests/vscode:/vscode.git/clone" did not exist on "694f9658c1f511e323bf86cd88af0a2e2b0fee9b"
Commit be3dfa50 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2876 failed with stages
in 0 seconds
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
settings = [
('temporal_sequences', 'mcq'),
('disambiguation_qa', 'mcq'),
('date_understanding', 'mcq'),
('tracking_shuffled_objects_three_objects', 'mcq'),
('penguins_in_a_table', 'mcq'),
('geometric_shapes', 'mcq'),
('snarks', 'mcq'),
('ruin_names', 'mcq'),
('tracking_shuffled_objects_seven_objects', 'mcq'),
('tracking_shuffled_objects_five_objects', 'mcq'),
('logical_deduction_three_objects', 'mcq'),
('hyperbaton', 'mcq'),
('logical_deduction_five_objects', 'mcq'),
('logical_deduction_seven_objects', 'mcq'),
('movie_recommendation', 'mcq'),
('salient_translation_error_detection', 'mcq'),
('reasoning_about_colored_objects', 'mcq'),
('multistep_arithmetic_two', 'free_form'),
('navigate', 'free_form'),
('dyck_languages', 'free_form'),
('word_sorting', 'free_form'),
('sports_understanding', 'free_form'),
('boolean_expressions', 'free_form'),
('object_counting', 'free_form'),
('formal_fallacies', 'free_form'),
('causal_judgement', 'free_form'),
('web_of_lies', 'free_form'),
]
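# A minimal sketch (not part of the original configs) of how a (task, type)
# settings list like the one above can be consumed: multiple-choice tasks get
# the MCQ evaluator plus answer postprocessing, while free-form tasks get the
# plain BBH evaluator, mirroring the two loops earlier in this section. The
# `_sketch_*` names are illustrative only, and infer_cfg is omitted for
# brevity (see the loops above for a full example).
from opencompass.datasets import (BBHDataset, BBHEvaluator, BBHEvaluator_mcq,
                                  bbh_mcq_postprocess)

_sketch_reader_cfg = dict(input_columns=['input'], output_column='target')
_sketch_datasets = []
for _task, _type in settings:
    if _type == 'mcq':
        _eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator_mcq),
            pred_role='BOT',
            pred_postprocessor=dict(type=bbh_mcq_postprocess),
            dataset_postprocessor=dict(type=bbh_mcq_postprocess))
    else:
        _eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
    _sketch_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_task,
            abbr='bbh-' + _task,
            reader_cfg=_sketch_reader_cfg,
            eval_cfg=_eval_cfg))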
from mmengine.config import read_base
with read_base():
from .bigcodebench_full_complete_gen_faf748 import bigcodebench_full_complete_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_full_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_complete_datasets = [
dict(
abbr='bigcodebench_full_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]
from mmengine.config import read_base
with read_base():
from .bigcodebench_full_instruct_gen_8815eb import bigcodebench_full_instruct_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_full_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_instruct_datasets = [
dict(
abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_complete_gen_faf748 import bigcodebench_hard_complete_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_hard_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_complete_datasets = [
dict(
abbr='bigcodebench_hard_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_instruct_gen_8815eb import bigcodebench_hard_instruct_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_hard_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_instruct_datasets = [
dict(
abbr='bigcodebench_hard_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]
# CaLM Lite
**CaLM Lite** is a lightweight version of CaLM.
**Ca**usal evaluation of **L**anguage **M**odels (CaLM), to the best of our knowledge, is the first comprehensive benchmark for evaluating the causal reasoning capabilities of language models. The CaLM framework establishes a foundational taxonomy consisting of four modules: causal target (i.e., what to evaluate), adaptation (i.e., how to obtain the results), metric (i.e., how to measure the results), and error (i.e., how to analyze the bad results).
<div align="center">
[🌐 Website](https://opencausalab.github.io/CaLM) |
[📃 Report](https://arxiv.org/abs/2405.00622) | [🎆 Github](https://github.com/OpenCausaLab/CaLM) | 📧 Welcome to join us by email at causalai@pjlab.org.cn
</div>
## Quick Start
### Data Preparation
Download the dataset to the `data/` folder.
```
wget https://github.com/OpenCausaLab/CaLM/releases/download/v1.0.0.lite/calm.zip
unzip calm.zip
```
### Run Model and Infer
To obtain a concise output with only the average information for all tasks, use:
```
python run.py --models YOUR_MODEL --datasets calm --summarizer calm
```
If you want detailed information for each task, use:
```
python run.py --models YOUR_MODEL --datasets calm
```
The `--summarizer calm` flag in the first command produces a summarized output; omitting it, as in the second command, reports task-specific details.
## Available Causal Tasks
We provide 92 tasks for causal evaluation, stored in the `data/calm` folder. For more information about our causal tasks, refer to [tasks](https://github.com/OpenCausaLab/CaLM/blob/main/documents/tasks.md).
The directory structure is:
```
├── calm
| ├── association
| ├── causal_discovery # Rung of the causal ladder
| │ ├── abstract_reasoning # Causal scenario
| │ │ ├── AR-B_CaLM-AR_CN.json # Causal task
| │ | └── AR-B_CaLM-AR_EN.json # Causal task
| │ └── ...
| └── ...
└── ...
```
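As a quick sanity check after downloading, you can load one of the task files from the tree above directly (a minimal sketch; only the object type and size are printed, since the exact JSON schema is not documented here):
```python
import json

# Peek at one causal task file under data/calm/ (path taken from the tree above).
with open('data/calm/causal_discovery/abstract_reasoning/AR-B_CaLM-AR_EN.json') as f:
    data = json.load(f)
print(type(data), len(data))
```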
## Dataset
- **Dataset size**: CaLM Lite uses a lightweight dataset of **9,200** samples, while CaLM uses a significantly larger dataset of 126,334. The table below details the English dataset composition; the Chinese version is structured identically.
- **Dataset configuration**: We prioritize balance in our dataset for **binary classification** and **choice selection** questions. By ensuring an equal number of each ground-truth (GT) label, we minimize the risk of introducing bias into the model's testing. For **probability calculation**, CaLM Lite pays extra attention to balancing the number of problems across different causal reasoning processes; a quick way to check this balance is sketched after the table below. (For more details on how the causal reasoning process is defined, please refer to Section 9.1.6 of the [paper](https://arxiv.org/abs/2405.00622).)
- **Efficient evaluation**: For enhanced evaluation efficiency, OpenCompass offers customizable methods. Refer to the [documentation](https://opencompass.org.cn/doc) for guidance on tailoring these methods to your needs.
| Causal ladder | Causal scenario | Subset | Question type | Mode | CaLM Lite | CaLM |
|---------------|-----------------|--------|---------------|------|-----------|------|
| Causal discovery | PCD | E-CARE | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | E-CARE | Choice selection | Natural | 100 | 1000 |
| Causal discovery | PCD | COPA | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | COPA | Choice selection | Natural | 100 | 1000 |
| Causal discovery | ECI | CTB | Binary classification | Natural | 100 | 596 |
| Causal discovery | ECI | ESC | Binary classification | Natural | 100 | 1000 |
| Causal discovery | ECI | MAVEN-ERE | Binary classification | Natural | 100 | 1000 |
| Causal discovery | AR | CaLM-AR | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FP | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FA | Binary classification | Symbolic | 100 | 1600 |
| Association | CORR | correlation | Binary classification | Natural | 100 | 1476 |
| Association | EAE | exp-away | Binary classification | Natural | 100 | 168 |
| Intervention | CB | collider-bias | Binary classification | Natural | 100 | 163 |
| Intervention | ATE | ATE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | ATE | ATE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | ATE | ATE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | CDE | CDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | BAS | backadj | Binary classification | Natural | 100 | 227 |
| Intervention | BAS | max-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | min-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | mix-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | FAS | FAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | IV | CaLM-IV | Choice selection | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.2-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.4-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.6-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.8-UC | Binary classification | Symbolic | 100 | 1600 |
| Counterfactuals | ETT | ETT-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | ETT | ETT-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | ETT | ETT-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NDE | NDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NIE | NIE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | AC | causal judgement | Binary classification | Natural | 100 | 187 |
| Counterfactuals | CR | CRASS | Choice selection | Natural | 100 | 274 |
| Counterfactuals | CR | det-counterfactual | Binary classification | Natural | 100 | 1476 |
| Counterfactuals | CEG | E-CARE | Open-ended generation | Natural | 100 | 1000 |
| **Total** | | | | | 4600 | 63167 |
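To verify the label balance described above for a binary-classification task, you can tally the ground-truth labels in a task file. This is a minimal sketch: it assumes the file is a JSON list of records and that each record stores its label under `gt_item`, the field name used as `output_column` in the dataset config.
```python
import json
from collections import Counter

# Count ground-truth labels in one binary-classification task file.
with open('data/calm/association/correlation/CORR-B_correlation_EN.json') as f:
    items = json.load(f)
print(Counter(item.get('gt_item') for item in items))
```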
## Available Prompt Styles (Adaptation)
Basic Prompt is the default setting for efficient evaluation of CaLM Lite. If you'd like to explore and compare a wider range of prompt styles, we encourage you to use CaLM; a comprehensive and easy-to-follow guide is provided in our [repository](https://github.com/OpenCausaLab/CaLM).
## Citation
```
@misc{chen2024causal,
title={Causal Evaluation of Language Models},
author={Sirui Chen and Bo Peng and Meiqi Chen and Ruiqi Wang and Mengying Xu and Xingyu Zeng and Rui Zhao and Shengjie Zhao and Yu Qiao and Chaochao Lu},
year={2024},
eprint={2405.00622},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CaLMDataset, CaLMEvaluator
task_hiearchy_dict = {
# association/
# correlation/
'CORR-B_correlation_CN':'association/correlation/',
'CORR-B_correlation_EN':'association/correlation/',
# explaining_away_effect/
'EAE-B_exp-away_CN':'association/explaining_away_effect/',
'EAE-B_exp-away_EN':'association/explaining_away_effect/',
# causal_discovery/
# abstract_reasoning/
'AR-B_CaLM-AR_CN':'causal_discovery/abstract_reasoning/',
'AR-B_CaLM-AR_EN':'causal_discovery/abstract_reasoning/',
# causal_attribution/
'CA-B_FA_CN':'causal_discovery/causal_attribution/',
'CA-B_FA_EN':'causal_discovery/causal_attribution/',
'CA-B_FP_CN':'causal_discovery/causal_attribution/',
'CA-B_FP_EN':'causal_discovery/causal_attribution/',
# event_causality_identification/
'ECI-B_CTB_CN':'causal_discovery/event_causality_identification/',
'ECI-B_CTB_EN':'causal_discovery/event_causality_identification/',
'ECI-B_ESC_CN':'causal_discovery/event_causality_identification/',
'ECI-B_ESC_EN':'causal_discovery/event_causality_identification/',
'ECI-B_MAVEN-ERE_CN':'causal_discovery/event_causality_identification/',
'ECI-B_MAVEN-ERE_EN':'causal_discovery/event_causality_identification/',
# pairwise_causal_discovery/
'PCD-B_COPA_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_COPA_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_E-CARE_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-B_E-CARE_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_COPA_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_COPA_EN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_E-CARE_CN':'causal_discovery/pairwise_causal_discovery/',
'PCD-C_E-CARE_EN':'causal_discovery/pairwise_causal_discovery/',
# counterfactual/
# actual_causality/
'AC-B_causal_judgement_CN':'counterfactual/actual_causality/',
'AC-B_causal_judgement_EN':'counterfactual/actual_causality/',
# causal_explanation_generation/
'CEG-O_E-CARE_CN':'counterfactual/causal_explanation_generation/',
'CEG-O_E-CARE_EN':'counterfactual/causal_explanation_generation/',
# counterfactual_reasoning/
'CR-B_det-counterfactual_CN':'counterfactual/counterfactual_reasoning/',
'CR-B_det-counterfactual_EN':'counterfactual/counterfactual_reasoning/',
'CR-C_CRASS_CN':'counterfactual/counterfactual_reasoning/',
'CR-C_CRASS_EN':'counterfactual/counterfactual_reasoning/',
# effect_of_the_treatment_on_the_treated/
'ETT-B_ETT-natural_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-B_ETT-natural_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-basic_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-basic_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-hard_CN':'counterfactual/effect_of_the_treatment_on_the_treated/',
'ETT-P_ETT-hard_EN':'counterfactual/effect_of_the_treatment_on_the_treated/',
# natural_direct_effect/
'NDE-B_NDE-natural_CN':'counterfactual/natural_direct_effect/',
'NDE-B_NDE-natural_EN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-basic_CN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-basic_EN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-hard_CN':'counterfactual/natural_direct_effect/',
'NDE-P_NDE-hard_EN':'counterfactual/natural_direct_effect/',
# natural_indirect_effect/
'NIE-B_NIE-natural_CN':'counterfactual/natural_indirect_effect/',
'NIE-B_NIE-natural_EN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-basic_CN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-basic_EN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-hard_CN':'counterfactual/natural_indirect_effect/',
'NIE-P_NIE-hard_EN':'counterfactual/natural_indirect_effect/',
# probability_of_necessity/
'PN-P_PN-basic_CN':'counterfactual/probability_of_necessity/',
'PN-P_PN-basic_EN':'counterfactual/probability_of_necessity/',
'PN-P_PN-hard_CN':'counterfactual/probability_of_necessity/',
'PN-P_PN-hard_EN':'counterfactual/probability_of_necessity/',
# probability_of_sufficiency/
'PS-P_PS-basic_CN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-basic_EN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-hard_CN':'counterfactual/probability_of_sufficiency/',
'PS-P_PS-hard_EN':'counterfactual/probability_of_sufficiency/',
# intervention/
# average_treatment_effect/
'ATE-B_ATE-natural_CN':'intervention/average_treatment_effect/',
'ATE-B_ATE-natural_EN':'intervention/average_treatment_effect/',
'ATE-P_ATE-basic_CN':'intervention/average_treatment_effect/',
'ATE-P_ATE-basic_EN':'intervention/average_treatment_effect/',
'ATE-P_ATE-hard_CN':'intervention/average_treatment_effect/',
'ATE-P_ATE-hard_EN':'intervention/average_treatment_effect/',
# backdoor_adjustment_set/
'BAS-B_backadj_CN':'intervention/backdoor_adjustment_set/',
'BAS-B_backadj_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_max-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_max-BAS_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_min-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_min-BAS_EN':'intervention/backdoor_adjustment_set/',
'BAS-C_mix-BAS_CN':'intervention/backdoor_adjustment_set/',
'BAS-C_mix-BAS_EN':'intervention/backdoor_adjustment_set/',
# causal_effect_identification/
'CEI-B_0.2-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.2-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.4-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.4-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.6-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.6-UC_EN':'intervention/causal_effect_identification/',
'CEI-B_0.8-UC_CN':'intervention/causal_effect_identification/',
'CEI-B_0.8-UC_EN':'intervention/causal_effect_identification/',
# collider_bias/
'CB-B_collider-bias_CN':'intervention/collider_bias/',
'CB-B_collider-bias_EN':'intervention/collider_bias/',
# controlled_direct_effect/
'CDE-B_CDE-natural_CN':'intervention/controlled_direct_effect/',
'CDE-B_CDE-natural_EN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-basic_CN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-basic_EN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-hard_CN':'intervention/controlled_direct_effect/',
'CDE-P_CDE-hard_EN':'intervention/controlled_direct_effect/',
# frontdoor_adjustment_set/
'FAS-C_FAS_CN':'intervention/frontdoor_adjustment_set/',
'FAS-C_FAS_EN':'intervention/frontdoor_adjustment_set/',
# instrumental_variable/
'IV-C_CaLM-IV_CN':'intervention/instrumental_variable/',
'IV-C_CaLM-IV_EN':'intervention/instrumental_variable/',}
calm_reader_cfg = dict(
input_columns=['question'],
output_column='gt_item')
calm_all_sets = list(set(key[:-3] for key in task_hiearchy_dict.keys()))  # strip the trailing '_CN'/'_EN' language suffix
calm_datasets = []
for _name in calm_all_sets:
for _prompt_style in ['basic','basic-CN']:
_task_name = _name + ('_CN' if _prompt_style.endswith('-CN') else '_EN')
_path = f'./data/calm/{task_hiearchy_dict[_task_name]}{_task_name}.json'
calm_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{question}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=500))
calm_eval_cfg = dict(evaluator=dict(
type=CaLMEvaluator,
core_metrics=True,
error_analysis=True,
prompt_style=_prompt_style,
task=_task_name))
calm_datasets.append(
dict(
abbr=f'calm_{_task_name}',
type=CaLMDataset,
path=_path,
prompt_style=_prompt_style,
reader_cfg=calm_reader_cfg,
infer_cfg=calm_infer_cfg,
eval_cfg=calm_eval_cfg)
)
del _prompt_style, _task_name, _path, _name
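# A minimal usage sketch (not part of the original config): loading the CaLM
# dataset list from an OpenCompass entry config via read_base, following the
# same pattern used by the BigCodeBench configs above. The relative module
# name `.calm` is an assumption and depends on where this config file lives.
from mmengine.config import read_base

with read_base():
    from .calm import calm_datasets  # assumed module name

datasets = calm_datasets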
# C-Eval
```bash
python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
```
## Base Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
| llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 |
| llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 |
| llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 |
| llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 |
| llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 |
| llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 |
| llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 |
| llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 |
| llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 |
| internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 |
| internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 |
| internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 |
| qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 |
| qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 |
| qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 |
| qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 |
| qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 |
| qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 |
| qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 |
| qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 |
| qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 |
| qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 |
| qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 |
| qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 |
| mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 |
| mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 45.21 | 29.52 |
| mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 |
| mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 |
| yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 |
| yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 |
| deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 |
| deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 |
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
| llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 |
| llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 |
| llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 |
| llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 |
| llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 |
| llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 |
| llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 |
| llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 |
| llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 |
| internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 |
| internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 |
| internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 |
| qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 |
| qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 |
| qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 |
| qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 |
| qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 |
| qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 |
| qwen1.5-4b-hf | 66.86 | 77.33 | 82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 |
| qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 |
| qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 |
| qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 |
| qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 |
| qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 |
| mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 |
| mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 |
| mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 |
| mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 |
| yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 |
| yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 |
| deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 |
| deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 |
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
| llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 |
| llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 |
| llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 |
| llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 |
| llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 |
| llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 |
| llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 |
| llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 |
| llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 |
| internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 |
| internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 70.13 |
| internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 |
| qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 |
| qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 |
| qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 |
| qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 |
| qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 |
| qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 |
| qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 |
| qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 |
| qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 |
| qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 |
| qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 |
| qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 |
| mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 |
| mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 |
| mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 |
| mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 |
| yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 |
| yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 |
| deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 |
| deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 |
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
| llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 |
| llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 |
| llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 |
| llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 |
| llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 |
| llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 |
| llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 |
| llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 |
| llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 |
| internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 |
| internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 |
| internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 |
| qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 |
| qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 |
| qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 |
| qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 |
| qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 |
| qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 |
| qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 |
| qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 |
| qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 |
| qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 |
| qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 |
| qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 |
| mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 |
| mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 |
| mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 |
| mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 |
| yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 |
| yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 |
| deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 |
| deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |
### Details on Dev Split
## Chat Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:-----------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:-----------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
| qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 |
| qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 |
| qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 |
| qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 |
| qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 |
| qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 |
| qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 |
| qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 |
| internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 |
| internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 |
| internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 |
| internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 |
| internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 |
| internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 |
| llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 |
| llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 |
| llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 |
| llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 |
| mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 |
| mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 |
| mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 |
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:-----------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
| qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 |
| qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 |
| qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 |
| qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 |
| qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 |
| qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 |
| qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 |
| qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 |
| internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 |
| internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 |
| internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 |
| internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 |
| internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 |
| internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 |
| llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 |
| llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 |
| llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 |
| llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 |
| mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 |
| mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 |
| mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 |
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:-----------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 |
| qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 |
| qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 |
| qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 |
| qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 |
| qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 |
| qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 |
| qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 |
| internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 |
| internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 |
| internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 |
| internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 |
| internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 |
| internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 |
| llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 |
| llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 |
| llama-3-8b-instruct-lmdeploy | 54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 |
| llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 |
| mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 |
| mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 |
| mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 |
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:-----------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 |
| qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 |
| qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 |
| qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 |
| qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 |
| qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 |
| qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 |
| qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 |
| internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 |
| internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 |
| internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 |
| internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 |
| internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 |
| internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 |
| llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 |
| llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 |
| llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 |
| llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 |
| mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 |
| mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 |
| mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:-----------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
| qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 |
| qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 |
| qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 |
| qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 |
| qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 |
| qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 |
| qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 |
| qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 |
| internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 |
| internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 |
| internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 |
| internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 |
| internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 |
| internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 |
| llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 |
| llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 |
| llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 |
| llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 |
| mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 |
| mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 |
| mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 |
### Details on Dev Split
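# CEval contamination-analysis config: 5-shot in-context PPL evaluation on the
# validation split, using CEvalDatasetClean together with
# AccContaminationEvaluator so accuracy is reported alongside a contamination
# analysis.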
from typing import List
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import CEvalDatasetClean as CEvalDataset
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template={
                    answer: dict(
                        begin='</E>',
                        round=[
                            dict(
                                role='HUMAN',
                                prompt=
                                f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
                            ),
                            dict(role='BOT', prompt=answer),
                        ])
                    for answer in ['A', 'B', 'C', 'D']
                },
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=PPLInferencer),
        )
        ceval_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator), analyze_contamination=True)
        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))
del _split, _name, _ch_name
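# Entry-point config: pulls in the generation-based CEval datasets defined in
# ceval_gen_5f30c7 via mmengine's read_base.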
from mmengine.config import read_base
with read_base():
    from .ceval_gen_5f30c7 import ceval_datasets  # noqa: F401, F403
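# CEval generation config (ceval_gen_5f30c7): 5-shot evaluation with
# GenInferencer on both the validation and test splits; predictions are
# normalized with first_capital_postprocess and scored by AccEvaluator.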
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin='</E>',
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=
                            f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
                        ),
                        dict(role='BOT', prompt='{answer}'),
                    ]),
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=GenInferencer),
        )
        ceval_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=first_capital_postprocess))
        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' +
                _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))
del _split, _name, _ch_name
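# Same generation-based CEval setup as above, restricted to the validation
# split only.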
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val']:
    for _name in ceval_all_sets:
        _ch_name = ceval_subject_mapping[_name][1]
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin='</E>',
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=
                            f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: '
                        ),
                        dict(role='BOT', prompt='{answer}'),
                    ]),
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=GenInferencer),
        )
        ceval_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=first_capital_postprocess))
        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' +
                _name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split=_split),
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))
del _split, _name, _ch_name
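# CEval PPL config: 5-shot evaluation on the validation and test splits that
# scores the per-option likelihood of the question plus "答案: <option>",
# prefixed by a subject-specific instruction (hint), and reports accuracy with
# AccEvaluator.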
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ['val', 'test']:
    for _name in ceval_all_sets:
        ceval_reader_cfg = dict(
            input_columns=['question', 'A', 'B', 'C', 'D'],
            output_column='answer',
            train_split='dev',
            test_split=_split,
        )
        _ch_name = ceval_subject_mapping[_name][1]
        hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
        question_and_options = '{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
        ceval_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template={answer: f'{question_and_options}\n答案: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template={answer: f'{hint}\n</E>{question_and_options}\n答案: {answer}' for answer in ['A', 'B', 'C', 'D']},
                ice_token='</E>',
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
            inferencer=dict(type=PPLInferencer),
        )
        ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
        ceval_datasets.append(
            dict(
                type=CEvalDataset,
                path='opencompass/ceval-exam',
                name=_name,
                abbr='ceval-' + _name if _split == 'val' else 'ceval-test-' + _name,
                reader_cfg=ceval_reader_cfg,
                infer_cfg=ceval_infer_cfg,
                eval_cfg=ceval_eval_cfg,
            ))
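# -----------------------------------------------------------------------------
# Minimal usage sketch (an assumption about the surrounding repo layout, not a
# file shown in this dump): a run config usually collects one of the
# `ceval_datasets` lists above through mmengine's read_base and exposes it to
# OpenCompass as `datasets`, e.g.:
#
#     from mmengine.config import read_base
#
#     with read_base():
#         from .ceval_gen_5f30c7 import ceval_datasets  # config shown above
#
#     datasets = [*ceval_datasets]
# -----------------------------------------------------------------------------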