# export DATASET_SOURCE='ModelScope'  # set this in your shell before running this script
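# A hedged alternative (assumption: OpenCompass only reads DATASET_SOURCE when the
# dataset configs below are actually loaded): set the variable from Python at the
# top of this file instead of exporting it in the shell, e.g.
#   import os
#   os.environ.setdefault('DATASET_SOURCE', 'ModelScope')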
from datasets import Dataset, DatasetDict
from mmengine.config import read_base
from tqdm import tqdm
with read_base():
    from opencompass.configs.datasets.agieval.agieval_gen import \
        agieval_datasets as agieval_v2_datasets # ok
    from opencompass.configs.datasets.agieval.agieval_gen_a0c741 import \
        agieval_datasets as agieval_v1_datasets # ok
    from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
        ARC_c_datasets as ARC_c_clean_datasets # ok
    from opencompass.configs.datasets.ARC_c.ARC_c_gen import \
        ARC_c_datasets # ok
    from opencompass.configs.datasets.ARC_e.ARC_e_gen import \
        ARC_e_datasets # ok
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
        ceval_datasets as ceval_clean_datasets # ok
    from opencompass.configs.datasets.ceval.ceval_gen import \
        ceval_datasets # ok
    from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen import \
        afqmc_datasets # ok
    from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import \
        cmnli_datasets # ok
    from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import \
        cmnli_datasets as cmnli_ppl_datasets # ok
    from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import \
        ocnli_datasets # ok
    from opencompass.configs.datasets.cmmlu.cmmlu_gen import \
        cmmlu_datasets # ok
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen import \
        commonsenseqa_datasets # needs extra handling for GPT
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen import \
        GaokaoBench_datasets # ok
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_mixed import \
        GaokaoBench_datasets as GaokaoBench_mixed_datasets # ok
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
        GaokaoBench_datasets as GaokaoBench_no_subjective_datasets # ok
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
        gsm8k_datasets # ok
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
        hellaswag_datasets as hellaswag_ice_datasets # ok
    from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
        hellaswag_datasets as hellaswag_clean_datasets # ok
    from opencompass.configs.datasets.hellaswag.hellaswag_gen import \
        hellaswag_datasets as hellaswag_v2_datasets # ok
    from opencompass.configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import \
        hellaswag_datasets as hellaswag_v1_datasets # ok
    from opencompass.configs.datasets.hellaswag.hellaswag_ppl_a6e128 import \
        hellaswag_datasets as hellaswag_v3_datasets # ok
    from opencompass.configs.datasets.humaneval.humaneval_gen import \
        humaneval_datasets # ok
    from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
        humaneval_datasets as humaneval_repeat10_datasets # ok
    from opencompass.configs.datasets.lambada.lambada_gen import \
        lambada_datasets # ok
    from opencompass.configs.datasets.lcsts.lcsts_gen import \
        lcsts_datasets # ok
    from opencompass.configs.datasets.math.math_gen import math_datasets # ok
    from opencompass.configs.datasets.mbpp.mbpp_gen import \
        mbpp_datasets as mbpp_v1_datasets # ok
    from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import \
        mbpp_datasets as mbpp_v2_datasets # ok
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \
        sanitized_mbpp_datasets # ok
    from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import \
        mmlu_datasets as mmlu_clean_datasets # ok
    from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets # ok
    from opencompass.configs.datasets.nq.nq_gen import nq_datasets # ok
    from opencompass.configs.datasets.obqa.obqa_gen import obqa_datasets # ok
    from opencompass.configs.datasets.obqa.obqa_ppl_6aac9e import \
        obqa_datasets as obqa_ppl_datasets # ok
    from opencompass.configs.datasets.piqa.piqa_gen import \
        piqa_datasets as piqa_v2_datasets # ok
    from opencompass.configs.datasets.piqa.piqa_ppl import \
        piqa_datasets as piqa_v1_datasets # ok
    from opencompass.configs.datasets.piqa.piqa_ppl_0cfff2 import \
        piqa_datasets as piqa_v3_datasets # ok
    from opencompass.configs.datasets.race.race_ppl import race_datasets # ok
    from opencompass.configs.datasets.siqa.siqa_gen import \
        siqa_datasets as siqa_v2_datasets # ok
    from opencompass.configs.datasets.siqa.siqa_gen_18632c import \
        siqa_datasets as siqa_v3_datasets # ok
    from opencompass.configs.datasets.siqa.siqa_ppl_42bc6e import \
        siqa_datasets as siqa_ppl_datasets # ok
    from opencompass.configs.datasets.storycloze.storycloze_gen import \
        storycloze_datasets # ok
    from opencompass.configs.datasets.storycloze.storycloze_ppl import \
        storycloze_datasets as storycloze_ppl_datasets # ok
    from opencompass.configs.datasets.strategyqa.strategyqa_gen import \
        strategyqa_datasets
    from opencompass.configs.datasets.summedits.summedits_gen import \
        summedits_datasets as summedits_v2_datasets # ok
    from opencompass.configs.datasets.triviaqa.triviaqa_gen import \
        triviaqa_datasets # ok
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
        triviaqa_datasets as triviaqa_wiki_1shot_datasets # ok
    from opencompass.configs.datasets.tydiqa.tydiqa_gen import \
        tydiqa_datasets # ok
    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
        winogrande_datasets as winogrande_5shot_ll_datasets # ok
    from opencompass.configs.datasets.winogrande.winogrande_gen import \
        winogrande_datasets
    from opencompass.configs.datasets.winogrande.winogrande_ll import \
        winogrande_datasets as winogrande_ll_datasets # ok
    from opencompass.configs.datasets.Xsum.Xsum_gen import Xsum_datasets
    from opencompass.configs.models.opt.hf_opt_125m import models
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
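# Keep only the first 5 train/test examples of every dataset so the whole
# collection can be smoke-tested quickly.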
for d in datasets:
    d['reader_cfg'].update({'train_range': '[0:5]', 'test_range': '[0:5]'})
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
with read_base():
    from opencompass.configs.datasets.winogrande.winogrande_gen_a027b6 import \
        winogrande_datasets
datasets = [*winogrande_datasets]
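# InternLM-chat style prompt template: user turns are wrapped in <|User|>: ... <eoh>,
# assistant turns in <|Bot|>: ... <eoa>, and the BOT round is the one to generate.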
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm-chat-7b-hf',
path='internlm/internlm-chat-7b',
tokenizer_path='internlm/internlm-chat-7b',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
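# Summary groups: report the plain average over all winogrande sub-datasets, plus a
# second 'winogrande_std' group with std=True to also report their standard deviation.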
_winogrande_all = [d['abbr'] for d in winogrande_datasets]
summarizer = dict(summary_groups=[
{
'name': 'winogrande',
'subsets': _winogrande_all
},
{
'name': 'winogrande_std',
'subsets': _winogrande_all,
'std': True
},
])
import os.path as osp
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model
    from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
        models as lmdeploy_gemma_9b_it_model
    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
        models as lmdeploy_gemma_27b_it_model
    # from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
        models as lmdeploy_ministral_8b_instruct_2410_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
        models as lmdeploy_qwen2_5_14b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
        models as lmdeploy_qwen2_5_32b_instruct_model
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
        models as lmdeploy_yi_1_5_9b_chat_model
    from opencompass.configs.summarizers.groups.musr_average import summarizer
datasets = [*musr_datasets]
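# Collect every `*_model` list imported above into a single flat list of models.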
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
base_exp_dir = 'outputs/musr/'
work_dir = osp.join(base_exp_dir, 'musr_eval')
from mmengine.config import read_base
with read_base():
    # Evaluate needlebench_4k; adjust the configuration to use 8k, 32k, 128k, 200k, or 1000k if necessary.
    # from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_4k import needlebench_datasets
    # from opencompass.configs.summarizers.needlebench import needlebench_4k_summarizer as summarizer
    # Only evaluate the original "needle in a haystack" test in needlebench_4k.
    from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_single_4k import (
        needlebench_en_datasets, needlebench_zh_datasets)
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as internlm2_chat_7b
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
        models as internlm2_chat_7b_200k
    from opencompass.configs.summarizers.needlebench import \
        needlebench_4k_summarizer as summarizer
    # Evaluate the Ancestral Tracing Challenge (ATC):
    # from opencompass.configs.datasets.needlebench.atc.atc_choice_50 import needlebench_datasets
    # from opencompass.configs.summarizers.needlebench import atc_summarizer_50 as summarizer
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
for m in internlm2_chat_7b:
    m['max_seq_len'] = 32768  # Ensure the InternLM2-7B model can receive the full length of long texts; adjust this for other models based on their supported maximum sequence length.
    m['max_out_len'] = 2000  # Ensure complete responses from the model in multi-needle retrieval tasks.
models = internlm2_chat_7b
work_dir = './outputs/needlebench'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.collections.leaderboard.qwen import \
        datasets
    from opencompass.configs.models.qwen.hf_qwen_7b import models
    from opencompass.configs.summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-hf
-------------------------------------- --------- ---------------- ------ ------------
--------- 考试 Exam --------- - - - -
ceval - naive_average ppl 58.65
agieval - naive_average mixed 40.49
mmlu - naive_average ppl 57.78
cmmlu - naive_average ppl 58.57
GaokaoBench - weighted_average mixed 51.76
ARC-c 72cf91 accuracy gen 83.73
ARC-e 72cf91 accuracy gen 90.65
--------- 语言 Language --------- - - - -
WiC ce62e6 accuracy ppl 51.10
chid-dev 25f3d3 accuracy ppl 86.63
afqmc-dev cc328c accuracy ppl 69.00
WSC 678cb5 accuracy ppl 63.46
tydiqa-goldp - naive_average gen 19.98
flores_100 - naive_average gen 3.20
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.00
commonsense_qa 0d8e25 accuracy ppl 67.49
triviaqa b6904f score gen 40.45
nq b6904f score gen 14.16
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 75.29
race-middle 73bdec accuracy ppl 90.53
race-high 73bdec accuracy ppl 87.71
openbookqa_fact fa871c accuracy gen 92.20
csl_dev 3c4211 accuracy ppl 56.25
lcsts 0b3969 rouge1 gen 12.38
Xsum 207e69 rouge1 gen 36.00
eprstmt-dev 101429 accuracy gen 89.38
lambada de1af2 accuracy gen 67.88
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 54.85
ocnli 1471e7 accuracy gen 42.34
AX_b 793c72 accuracy gen 58.61
AX_g c4c886 accuracy gen 69.10
RTE c4c886 accuracy gen 57.76
COPA 59f42c accuracy gen 88.00
ReCoRD 3e0689 score gen 27.78
hellaswag 06a1e2 accuracy gen 92.47
piqa 24369d accuracy gen 78.02
siqa ea30d1 accuracy ppl 75.03
math 2c0b9e accuracy gen 11.06
gsm8k 4c7f6e accuracy gen 50.87
drop 53a0a7 score gen 44.95
openai_humaneval dd0dff humaneval_pass@1 gen 23.78
mbpp 60ca11 score gen 31.20
bbh - naive_average gen 40.03
'''
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.collections.leaderboard.qwen_chat import \
        datasets
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
    from opencompass.configs.summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-chat-hf
-------------------------------------- --------- ---------------- ------ -----------------
--------- 考试 Exam --------- - - - -
ceval - naive_average gen 56.07
agieval - naive_average mixed 39.51
mmlu - naive_average gen 53.49
cmmlu - naive_average gen 55.29
GaokaoBench - weighted_average gen 48.01
ARC-c ca1e8e accuracy ppl 74.92
ARC-e ca1e8e accuracy ppl 85.71
--------- 语言 Language --------- - - - -
WiC efbd01 accuracy gen 51.41
chid-dev 25f3d3 accuracy ppl 77.72
afqmc-dev 4a1636 accuracy gen 69.00
WSC 678cb5 accuracy ppl 67.31
tydiqa-goldp - naive_average gen 15.32
flores_100 - naive_average gen 10.00
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.18
commonsense_qa ddaabf accuracy gen 76.41
triviaqa b6904f score gen 43.25
nq 23dc1a score gen 16.26
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 81.53
race-middle e0908b accuracy gen 83.01
race-high e0908b accuracy gen 77.79
openbookqa_fact 49689a accuracy ppl 86.40
csl_dev 3c4211 accuracy ppl 64.38
lcsts 0b3969 rouge1 gen 12.75
Xsum 207e69 rouge1 gen 20.21
eprstmt-dev ed0c5d accuracy ppl 85.00
lambada de1af2 accuracy gen 59.19
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 48.08
ocnli 15e783 accuracy ppl 51.40
AX_b 689df1 accuracy ppl 65.67
AX_g 808a19 accuracy ppl 76.12
RTE 808a19 accuracy ppl 68.95
COPA 59f42c accuracy gen 92.00
ReCoRD 6f7cfc score gen 0.16
hellaswag 8d79e0 accuracy ppl 69.28
piqa 34eee7 accuracy ppl 72.20
siqa ea30d1 accuracy ppl 72.88
math 2c0b9e accuracy gen 7.84
gsm8k 4c7f6e accuracy gen 45.41
drop 53a0a7 score gen 39.62
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
mbpp 60ca11 score gen 20.60
bbh - naive_average gen 42.61
'''
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.lawbench.lawbench_one_shot_gen_002588 import \
        lawbench_datasets as lawbench_one_shot_datasets
    from opencompass.configs.datasets.lawbench.lawbench_zero_shot_gen_002588 import \
        lawbench_datasets as lawbench_zero_shot_datasets
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
    from opencompass.configs.summarizers.lawbench import summarizer
datasets = lawbench_zero_shot_datasets + lawbench_one_shot_datasets
for d in datasets:
    d['infer_cfg']['inferencer']['save_every'] = 1
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
with read_base():
    from opencompass.configs.datasets.ruler.ruler_cwe_gen import \
        cwe_datasets # CWE
    from opencompass.configs.datasets.ruler.ruler_fwe_gen import \
        fwe_datasets # FWE
    from opencompass.configs.datasets.ruler.ruler_niah_gen import \
        niah_datasets # Niah
    from opencompass.configs.datasets.ruler.ruler_qa_gen import \
        qa_datasets # QA
    from opencompass.configs.datasets.ruler.ruler_vt_gen import \
        vt_datasets # VT
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
        models as internlm2_5_7b_chat_1m
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as qwen2_7b_instruct_model
    from opencompass.configs.summarizers.groups.ruler import \
        ruler_summary_groups
import_datasets = sum(
[niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])
# Evaluation config
NUM_SAMPLES = 500
# Change the context lengths to be tested
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
abbr_suffixs = ['4k', '8k', '16k', '32k']
work_dir = './outputs/ruler'
# Model Settings
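# session_len is set to 33792, i.e. the longest tested context (32k) plus roughly 1k,
# presumably as headroom for the prompt and generated answer; each of these models
# runs with tensor parallelism (tp=2) across 2 GPUs.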
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2
llama3_8b_instruct_model[0]['max_seq_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2
model_settings = [
[qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
[llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
[internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
]
# Dataset Model Combination
datasets = []
models = []
model_dataset_combinations = []
# Different seq length
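# For every (context length, model) pair, build a dedicated copy of each RULER
# dataset: tag its abbr with the length suffix, point it at that model's tokenizer,
# and cap it at NUM_SAMPLES. model_dataset_combinations records which model should
# be evaluated on which copies.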
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for model, model_path in model_settings:
        _tmp_datasets = []
        for dataset in import_datasets:
            tmp_dataset = dataset.deepcopy()
            tmp_dataset['tokenizer_model'] = model_path
            tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
            tmp_dataset['num_samples'] = NUM_SAMPLES
            tmp_dataset['max_seq_length'] = max_seq_len
            _tmp_datasets.append(tmp_dataset)
        model_dataset_combinations.append(
            dict(models=[model], datasets=_tmp_datasets))
        models.append(model)
        datasets.extend(_tmp_datasets)
infer = dict(
partitioner=dict(type=NumWorkerPartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask),
retry=5),
)
eval = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=32,
task=dict(type=OpenICLEvalTask)),
)
summarizer = dict(
dataset_abbrs=abbr_suffixs,
summary_groups=sum([ruler_summary_groups], []),
)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind internlm2_5-7b-chat-1m-turbomind
# --------- --------- ------------- ------ ----------------------------- ------------------------------- ----------------------------------
# 4k - naive_average gen 93.66 93.48 91.20
# 8k - naive_average gen 88.38 89.95 89.07
# 16k - naive_average gen 84.27 0.14 87.61
# 32k - naive_average gen 81.36 0.00 84.59
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
with read_base():
    from opencompass.configs.datasets.ruler.ruler_combined_gen import \
        ruler_combined_datasets
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
        models as internlm2_5_7b_chat_1m
    from opencompass.configs.summarizers.groups.ruler import \
        ruler_summary_groups
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = internlm2_5_7b_chat_1m
work_dir = './outputs/ruler'
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask),
retry=5),
)
eval = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=32,
task=dict(type=OpenICLEvalTask)),
)
summarizer = dict(
dataset_abbrs=['ruler_4k', 'ruler_8k', 'ruler_16k', 'ruler_32k'],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import \
        datasets
    from opencompass.configs.models.rwkv.rwkv5_3b import models
    from opencompass.configs.summarizers.leaderboard import summarizer
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
with read_base():
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen import \
        simpleqa_datasets
    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
        models as gpt_4o_2024_05_13_model
models = gpt_4o_2024_05_13_model # model for generation
judge_models = gpt_4o_2024_05_13_model # model for evaluation
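# The same GPT-4o config is used both to generate answers and to judge them.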
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=SubjectiveEvalTask)),
)
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
    from opencompass.configs.datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
    from opencompass.configs.datasets.subjective.fofo.fofo_judge import fofo_datasets
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
    from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
    SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
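# Meta template for API-style chat messages: plain HUMAN/BOT rounds, with the BOT
# round marked as the one to be generated.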
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable do_sample for the models under test
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True, # For subjective evaluation, we suggest setting do_sample=True when running model inference.
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [
*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets,
*compassarena_datasets, *compassbench_datasets, *fofo_datasets,
*mtbench_datasets, *mtbench101_datasets, *wildbench_datasets
] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx', # The key is read from $OPENAI_API_KEY by default; you can also hard-code your key here.
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
work_dir = 'outputs/subjective/'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3)
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AlpacaSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
# To run this config, please make sure you have successfully installed `alpaca-eval==0.6` and `scikit-learn==1.5`.
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable do_sample for the models under test
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(do_sample=True, ),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*alpacav2]
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
gpt4_judge = dict(
abbr='GPT4-Turbo',
path='gpt-4-1106-preview',
key='', # The key is read from $OPENAI_API_KEY by default; you can also hard-code your key here.
config='weighted_alpaca_eval_gpt4_turbo')
## ------------- Evaluation Configuration
eval = dict(partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=256,
task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
))
work_dir = 'outputs/alpaca/'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
        alpacav2_datasets, )
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_bradleyterry import (
        arenahard_datasets, )
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
        compassarena_datasets, )
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
        wildbench_datasets, )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat, )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
        models as lmdeploy_internlm2_5_20b_chat, )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct, )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct, )
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
        models as lmdeploy_qwen2_7b_instruct, )

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
    SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import (CompassArenaBradleyTerrySummarizer,
                                     SubjectiveSummarizer)
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable do_sample for the models under test
models = [
*lmdeploy_internlm2_5_7b_chat,
*lmdeploy_internlm2_5_20b_chat,
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*alpacav2_datasets,
*arenahard_datasets,
*compassarena_datasets,
*wildbench_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry (statistical) model, with options to include
# style features and group-based control variables (the group variables must be
# available in the input dataset for each observation).
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
report_pred_win_rates=True,
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
)
work_dir = 'outputs/subjective/bradleyterry'
from copy import deepcopy
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
        teval_datasets as teval_en_datasets
    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
        teval_datasets as teval_zh_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as hf_internlm2_chat_7b_model
    from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \
        models as hf_llama2_7b_chat_model
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
        models as hf_qwen_7b_chat_model
    from opencompass.configs.summarizers.teval import summarizer
meta_template_system_patches = {
'internlm2-chat-7b-hf':
dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
'internlm2-chat-20b-hf':
dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
}
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
models = []
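# Make sure every model's meta_template has a SYSTEM round: use a hand-written patch
# from meta_template_system_patches if one exists for the model, otherwise clone the
# model's HUMAN round and relabel it as SYSTEM.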
for m in _origin_models:
    m = deepcopy(m)
    if 'meta_template' in m and 'round' in m['meta_template']:
        round = m['meta_template']['round']
        if all(r['role'].upper() != 'SYSTEM'
               for r in round):  # no system round
            if m['abbr'] in meta_template_system_patches:
                system_round = meta_template_system_patches[m['abbr']]
            else:
                system_round = [
                    r for r in round if r['role'].upper() == 'HUMAN'
                ][0]
            system_round = deepcopy(system_round)
            system_round['role'] = 'SYSTEM'
            m['meta_template']['round'].append(system_round)
    else:
        raise ValueError(f'no meta_template.round in {m.get("abbr", None)}')
    print(
        f'model {m["abbr"]} is using the following meta_template: {m["meta_template"]}'
    )
    models.append(m)
datasets = teval_en_datasets + teval_zh_datasets
work_dir = './outputs/teval'
'''
dataset version metric mode qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf
------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
teval - naive_average unknown 57.69 78.18 36.63
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
teval-instruct_v1 10482d json_metric unknown 94.32 97.08 0.15
teval-plan_str_v1 10482d f1_score unknown 66.24 84.12 45.72
teval-plan_json_v1 10482d f1_score unknown 63.62 77.71 19.95
teval-reason_str_v1 10482d thought unknown 54.14 63.58 44.92
teval-reason_retrieve_understand_json_v1 10482d thought unknown 33.77 54.72 21.49
teval-retrieve_str_v1 10482d name unknown 73.89 85.28 60.6
teval-reason_retrieve_understand_json_v1 10482d name unknown 31.15 68.97 15.34
teval-understand_str_v1 10482d args unknown 77.76 93.03 65.61
teval-reason_retrieve_understand_json_v1 10482d args unknown 44.16 72.23 26.84
teval-review_str_v1 10482d review_quality unknown 62.22 71.66 44.35
teval_zh - naive_average unknown 61.31 75.01 32.33
teval-instruct_v1_zh 10482d string_metric unknown 88.69 98.19 23.64
teval-instruct_v1_zh 10482d json_metric unknown 75.77 96.62 0.89
teval-plan_str_v1_zh 10482d f1_score unknown 62.43 70.69 47.82
teval-plan_json_v1_zh 10482d f1_score unknown 61.46 68.95 15.87
teval-reason_str_v1_zh 10482d thought unknown 59.43 68.14 46.96
teval-reason_retrieve_understand_json_v1_zh 10482d thought unknown 39.19 60.37 23.91
teval-retrieve_str_v1_zh 10482d name unknown 69.41 84.22 54.44
teval-reason_retrieve_understand_json_v1_zh 10482d name unknown 32.87 70.46 14.16
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
'''
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets as chat_ceval_datasets
    from opencompass.configs.datasets.ceval.ceval_ppl_578f8d import \
        ceval_datasets as base_ceval_datasets
    from opencompass.configs.internal.clusters.slurm import eval, infer
    from opencompass.configs.models.qwen.hf_qwen_7b import \
        models as hf_qwen_7b_base_models
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
        models as hf_qwen_7b_chat_models
    # from opencompass.configs.internal.clusters.slurm import infer_split as infer, eval
    # from opencompass.configs.internal.clusters.slurm import infer_size as infer, eval
    # from opencompass.configs.internal.clusters.slurm import infer_size_split as infer, eval
base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]
# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (i.e. use different `abbr` values in the model & dataset configs), as we have
# not tested overlapping combinations thoroughly.
model_dataset_combinations = [
dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
# dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]
# The union of all models and datasets used in model_dataset_combinations should
# also be stored in the `models` and `datasets` variables below; otherwise,
# modules such as the summarizer will miss some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]
work_dir = './outputs/default/mdcomb/'
"""
dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
---------------------- --------- -------- ------ ------------ -----------------
ceval-computer_network 9b9417 accuracy ppl 52.63 -
ceval-physician 6e277d accuracy gen - 59.18
"""
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from ..summarizers.example import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
work_dir = './outputs/llama2-chat/'
llama_chat_meta_template = dict(
round=[
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
dict(role="BOT", begin=' ', end=' ', generate=True),
],
)
models = [
dict(
type=TurboMindModel,
abbr='llama-2-7b-chat-hf-lmdeploy',
path="Llama-2-7b-chat-hf",
meta_template=llama_chat_meta_template,
engine_config=dict(session_len=4096,
max_batch_size=32),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=1,
concurrency=1,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='[INST]',
)
]
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from ..summarizers.example import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
work_dir = './outputs/llama2/'
models = [
dict(
type=TurboMindModel,
abbr='llama-2-7b-hf-lmdeploy',
path="Llama-2-7b-hf",
engine_config=dict(session_len=4096,
max_batch_size=32,
rope_scaling_factor=1.0),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=1,
concurrency=1,
run_cfg=dict(num_gpus=1, num_procs=1)
)
]
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from ..summarizers.example import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
work_dir = './outputs/qwen-chat/'
qwen_meta_template = dict(
round=[
dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=TurboMindModel,
abbr='qwen-7b-lmdeploy',
path="./Qwen-7B-chat",
meta_template=qwen_meta_template,
engine_config=dict(session_len=8192,
max_batch_size=32
),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=1,
concurrency=1,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='[INST]'
)
]
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from ..summarizers.example import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
work_dir = './outputs/qwen/'
models = [
dict(
type=TurboMindModel,
abbr='qwen-7b-lmdeploy',
path="/Qwen-7B",
engine_config=dict(session_len=8192,
max_batch_size=32),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=1,
concurrency=1,
run_cfg=dict(num_gpus=1, num_procs=1)
)
]