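# eval_ruler.py: OpenCompass config that benchmarks three chat models served
# with LMDeploy (Qwen2-7B-Instruct, Llama-3-8B-Instruct, InternLM2.5-7B-Chat-1M)
# on the RULER long-context suite (NIAH, VT, FWE, CWE, QA) at 4k-32k contexts.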
from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.ruler.ruler_cwe_gen import \
        cwe_datasets  # CWE
    from opencompass.configs.datasets.ruler.ruler_fwe_gen import \
        fwe_datasets  # FWE
    from opencompass.configs.datasets.ruler.ruler_niah_gen import \
        niah_datasets  # Niah
    from opencompass.configs.datasets.ruler.ruler_qa_gen import \
        qa_datasets  # QA
    from opencompass.configs.datasets.ruler.ruler_vt_gen import \
        vt_datasets  # VT
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
        models as internlm2_5_7b_chat_1m
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as qwen2_7b_instruct_model
    from opencompass.configs.summarizers.groups.ruler import \
        ruler_summary_groups

# Flatten the five RULER task lists (NIAH/VT/FWE/CWE/QA) into one dataset list.
import_datasets = sum(
    [niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])

# Evaluation config
NUM_SAMPLES = 500  # samples drawn per RULER task
# Context lengths to test; the two lists must stay aligned one-to-one.
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
abbr_suffixs = ['4k', '8k', '16k', '32k']
work_dir = './outputs/ruler'
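# To probe longer contexts, extend both lists in lockstep and raise the model
# session_len/max_seq_len below to match, e.g. (sketch):
#   max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32, 1024 * 64]
#   abbr_suffixs = ['4k', '8k', '16k', '32k', '64k']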

# Model Settings
# The 32k RULER split needs headroom beyond 32768 tokens for the chat
# template, so session_len/max_seq_len are raised to 33792 (32k + 1k) and
# each model is sharded over 2 GPUs (tp=2). internlm2_5-7b-chat-1m already
# ships with a long-context setup, so its imported defaults are kept.
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2
llama3_8b_instruct_model[0]['max_seq_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2
# (model config, HF tokenizer path) pairs; the path tells the RULER dataset
# builders which tokenizer to use when synthesizing prompts of target length.
model_settings = [
    [qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
    [llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
    [internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
]
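# Additional models follow the same pattern: pair the model config with the
# Hugging Face tokenizer path RULER should use, e.g. (hypothetical entry):
#   model_settings.append([my_model[0], 'my-org/my-model'])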

# Dataset-model combinations: model_dataset_combinations restricts inference
# so each model only runs the datasets tokenized for it.
datasets = []
models = []
model_dataset_combinations = []

# Build one copy of every dataset per (context length, model) pair: the
# tokenizer must match the model so prompts land on the target token budget.
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for model, model_path in model_settings:
        _tmp_datasets = []
        for dataset in import_datasets:
            tmp_dataset = dataset.deepcopy()
            tmp_dataset['tokenizer_model'] = model_path
            tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
            tmp_dataset['num_samples'] = NUM_SAMPLES
            tmp_dataset['max_seq_length'] = max_seq_len
            _tmp_datasets.append(tmp_dataset)
        # Pair each model only with the datasets built for its tokenizer.
        model_dataset_combinations.append(
            dict(models=[model], datasets=_tmp_datasets))
        models.append(model)
        datasets.extend(_tmp_datasets)

# Inference: NumWorkerPartitioner splits datasets across workers for parallel
# generation; failed tasks are retried up to 5 times.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask),
                retry=5),
)

# Evaluation is lightweight, so a NaivePartitioner (one task per
# model-dataset pair) is enough.
eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=32,
                task=dict(type=OpenICLEvalTask)),
)

# Summarize per context length: ruler_summary_groups averages the individual
# RULER tasks into one '4k'/'8k'/'16k'/'32k' score each.
summarizer = dict(
    dataset_abbrs=abbr_suffixs,
    summary_groups=ruler_summary_groups,
)

# Reference results from one run of this config:
# dataset    version    metric         mode      qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind    internlm2_5-7b-chat-1m-turbomind
# ---------  ---------  -------------  ------  -----------------------------  -------------------------------  ----------------------------------
# 4k         -          naive_average  gen                             93.66                            93.48                               91.20
# 8k         -          naive_average  gen                             88.38                            89.95                               89.07
# 16k        -          naive_average  gen                             84.27                             0.14                               87.61
# 32k        -          naive_average  gen                             81.36                             0.00                               84.59
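# Note: Llama-3-8B-Instruct has a native 8k context window, which explains
# its collapse at 16k/32k above.
# To launch (path illustrative; assumes the standard OpenCompass entry point):
#   python run.py examples/eval_ruler.py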