from mmengine.config import read_base

with read_base():
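    # Inside read_base(), mmengine resolves these imports from other config
    # modules and merges the imported names (datasets, runner settings,
    # models, summarizer) into this file's namespace.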
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
        mmlu_pro_datasets
    from opencompass.configs.internal.clusters.local import eval
    from opencompass.configs.internal.clusters.local import \
        infer_num_worker as infer
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_pro import summarizer

# Gather every `*_datasets` list imported above (plus any pre-existing
# `datasets`) into one flat list for OpenCompass.
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
# Likewise, flatten every imported `*_model` list into a single model list.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

work_dir = 'outputs/debug/mmlu_pro'
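
# A typical launch (assuming this file sits in an OpenCompass checkout;
# adjust the path to wherever you keep it):
#     python run.py eval_mmlu_pro.py
# The commented table below records reference scores for the two models.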

# dataset                    version    metric         mode      qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind
# -------------------------  ---------  -------------  ------  -----------------------------  -------------------------------
# mmlu_pro                   -          naive_average  gen                             46.18                            43.92
# mmlu_pro_biology           736233     accuracy       gen                             63.74                            64.02
# mmlu_pro_business          736233     accuracy       gen                             53.23                            46.01
# mmlu_pro_chemistry         736233     accuracy       gen                             35.25                            32.42
# mmlu_pro_computer_science  736233     accuracy       gen                             47.07                            44.88
# mmlu_pro_economics         736233     accuracy       gen                             59.00                            53.79
# mmlu_pro_engineering       736233     accuracy       gen                             26.73                            33.54
# mmlu_pro_health            736233     accuracy       gen                             47.31                            51.34
# mmlu_pro_history           736233     accuracy       gen                             42.78                            42.26
# mmlu_pro_law               736233     accuracy       gen                             28.07                            26.98
# mmlu_pro_math              736233     accuracy       gen                             53.59                            37.53
# mmlu_pro_philosophy        736233     accuracy       gen                             42.28                            42.48
# mmlu_pro_physics           736233     accuracy       gen                             39.11                            33.64
# mmlu_pro_psychology        736233     accuracy       gen                             60.90                            59.65
# mmlu_pro_other             736233     accuracy       gen                             47.40                            46.32