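# Evaluate ReAct-style code agents (Lagent CodeAgent wrapping an LLM) on the
# GSM8K and MATH datasets.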
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_57b0b1 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_943d32 import math_datasets
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
models = [
dict(abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
batch_size=8),
dict(abbr='WizardCoder-Python-13B-V1.0-react',
type=CodeAgent,
llm=dict(
type=HuggingFaceCausalLM,
path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_seq_len=2048,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
),
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1)),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
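# SizePartitioner splits inference into tasks capped at max_task_size (an
# approximate per-task sample budget), and LocalRunner executes up to
# max_num_workers of them concurrently on the local machine.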
# This config is used to test all the code benchmarks
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
with read_base():
# Datasets Part
# bigcodebench
from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import (
bigcodebench_full_instruct_datasets
)
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import (
bigcodebench_hard_instruct_datasets
)
# livecodebench code generation lite v5
from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import (
LCB_datasets
)
# humaneval series
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
humaneval_datasets
)
from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
humanevalpro_datasets
)
from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import (
humanevalx_datasets
)
from opencompass.configs.datasets.humaneval_plus.humaneval_plus_gen import (
humaneval_plus_datasets
)
# mbpp series
from opencompass.configs.datasets.mbpp.mbpp_gen import (
mbpp_datasets
)
from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
mbpppro_datasets
)
# multipl-e
from opencompass.configs.datasets.multipl_e.multiple_gen import (
multiple_datasets
)
# ds1000
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
ds1000_datasets
)
# Models Part
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
# Summary Groups
from opencompass.configs.summarizers.groups.ds1000 import (
ds1000_summary_groups,
)
from opencompass.configs.summarizers.groups.multipl_e import (
multiple_summary_groups,
)
from opencompass.configs.summarizers.groups.humanevalx import (
humanevalx_summary_groups,
)
# models config
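# Aggregate every list imported above whose variable name ends with '_model';
# sum(list_of_lists, []) flattens them into one flat list of model configs.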
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
model['max_seq_len'] = 16384
model['max_out_len'] = 8192
# datasets config
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
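# humanevalx and ds1000 are scored by an external code-evaluation service;
# point each evaluator at the service endpoint below.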
for item in humanevalx_datasets:
item['eval_cfg']['evaluator'][
'ip_address'
] = 'codeeval.opencompass.org.cn/humanevalx'
item['eval_cfg']['evaluator']['port'] = ''
for item in ds1000_datasets:
item['eval_cfg']['evaluator'][
'ip_address'
] = 'codeeval.opencompass.org.cn/ds1000'
item['eval_cfg']['evaluator']['port'] = ''
for dataset in datasets:
dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
# summary
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.append(
{'name': 'humanevalx',
'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
)
summarizer = dict(
dataset_abbrs = [
['bigcodebench_hard_instruct', 'pass@1'],
['bigcodebench_full_instruct', 'pass@1'],
['lcb_code_generation', 'pass@1'],
['openai_humaneval', 'humaneval_pass@1'],
['mbpp', 'score'],
['humaneval_pro', 'pass@1'],
['mbpp_pro', 'pass@1'],
['humaneval_plus', 'humaneval_plus_pass@1'],
['multiple', 'naive_average'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'humaneval-multiple-cpp',
'humaneval-multiple-cs',
'humaneval-multiple-go',
'humaneval-multiple-java',
'humaneval-multiple-rb',
'humaneval-multiple-js',
'humaneval-multiple-php',
'humaneval-multiple-r',
'humaneval-multiple-rs',
'humaneval-multiple-sh',
'',
'mbpp-multiple-cpp',
'mbpp-multiple-cs',
'mbpp-multiple-go',
'mbpp-multiple-java',
'mbpp-multiple-rb',
'mbpp-multiple-js',
'mbpp-multiple-php',
'mbpp-multiple-r',
'mbpp-multiple-rs',
'mbpp-multiple-sh'
],
summary_groups=summary_groups,
)
work_dir = 'outputs/code'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.humanevalx.humanevalx_gen import \
humanevalx_datasets
from opencompass.configs.models.codegeex2.hf_codegeex2_6b import models
datasets = humanevalx_datasets
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_judge import compassarena_subjectivebench_singleturn_datasets
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_judge import compassarena_subjectivebench_multiturn_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import models as lmdeploy_internlm2_5_20b_chat
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import models as lmdeploy_llama3_1_8b_instruct
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import models as lmdeploy_llama3_1_70b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import models as lmdeploy_qwen2_5_0_5b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import models as lmdeploy_qwen2_5_1_5b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import models as lmdeploy_qwen2_5_3b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import models as lmdeploy_qwen2_5_14b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import models as lmdeploy_qwen2_5_32b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
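# The meta template maps OpenCompass dialogue roles onto the API's chat roles;
# generate=True marks the turn the model is asked to produce.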
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable do_sample for models
# models = [
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='CompassJudger-1-7B-Instruct',
# path='opencompass/CompassJudger-1-7B-Instruct',
# engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
# gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
# max_seq_len=16384,
# max_out_len=2048,
# batch_size=16,
# run_cfg=dict(num_gpus=1),
# )
# ]
models = [
*lmdeploy_qwen2_5_14b_instruct, *lmdeploy_qwen2_5_32b_instruct,
*lmdeploy_qwen2_5_7b_instruct, *lmdeploy_qwen2_7b_instruct
]
datasets = [
*compassarena_subjectivebench_singleturn_datasets,
*compassarena_subjectivebench_multiturn_datasets
] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=DefaultSubjectiveSummarizer)
work_dir = 'outputs/subjective/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_bt_judge import (
compassarena_subjectivebench_bradleyterry_singleturn_datasets, )
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_bt_judge import (
compassarena_subjectivebench_bradleyterry_multiturn_datasets, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat, )
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct, )
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import (
models as lmdeploy_llama3_1_70b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import (
models as lmdeploy_qwen2_5_0_5b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import (
models as lmdeploy_qwen2_5_1_5b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import (
models as lmdeploy_qwen2_5_3b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
models as lmdeploy_qwen2_5_32b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import (
models as lmdeploy_qwen2_5_72b_instruct, )
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct, )
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import CompassArenaBradleyTerrySummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
models = [
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_32b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*compassarena_subjectivebench_bradleyterry_singleturn_datasets,
*compassarena_subjectivebench_bradleyterry_multiturn_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry model (statistical model) with an option
# to include style features and control variables based on groups
# (group variables must be available in the input dataset for each observation).
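# For intuition, a minimal sketch of the core Bradley-Terry relation this
# summarizer estimates (stdlib only; `bt_win_prob` is an illustrative helper,
# not part of the OpenCompass API): each model m is assigned a fitted strength
# s_m, and the predicted win rate of model i over model j is
# exp(s_i) / (exp(s_i) + exp(s_j)).
import math


def bt_win_prob(s_i: float, s_j: float) -> float:
    """Predicted probability that model i beats model j under Bradley-Terry."""
    return math.exp(s_i) / (math.exp(s_i) + math.exp(s_j))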
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
report_pred_win_rates=True,
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
groups=['difficulty', 'category'],
)
work_dir = 'outputs/compassarena_subjectivebench_bradleyterry/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
ARC_c_datasets
from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
ceval_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model
from opencompass.configs.models.qwen.hf_qwen_7b import \
models as hf_qwen_7b_model
from opencompass.configs.models.yi.hf_yi_6b import models as hf_yi_6b_model
from opencompass.configs.summarizers.contamination import summarizer
datasets = [
*ceval_datasets, *mmlu_datasets, *hellaswag_datasets, *ARC_c_datasets
]
models = [*hf_yi_6b_model, *hf_qwen_7b_model, *hf_llama2_7b_model]
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
gpqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import \
humaneval_datasets
# ## Math
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets
# Model List
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups
# TODO: Add LiveCodeBench
# ## Instruction Following
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name': 'core_average',
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'], ['bbh', 'naive_average'],
['hellaswag', 'accuracy'], ['drop', 'accuracy'],
['math', 'accuracy'], ['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score'],
['mathbench-t (average)', 'naive_average']],
},
]
summarizer = dict(
dataset_abbrs=[
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'naive_average'],
['hellaswag', 'accuracy'],
['drop', 'accuracy'],
['math', 'accuracy'],
['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math', 'accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science', 'accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'base_objective')
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
humaneval_datasets
# TODO: Add LiveCodeBench
# ## Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets
# ## Math
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name': 'core_average',
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'], ['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'], ['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'], ['hellaswag', 'accuracy'],
['mathbench-t (average)', 'naive_average']],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'],
['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'],
['hellaswag', 'accuracy'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math', 'accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science', 'accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
'',
['bbh', 'extract_rate'],
['math', 'extract_rate'],
# ['openai_humaneval', 'extract_rate'],
['GPQA_diamond', 'extract_rate'],
# ['IFEval', 'extract_rate'],
'',
['mmlu', 'extract_rate'],
['mmlu-stem', 'extract_rate'],
['mmlu-social-science', 'extract_rate'],
['mmlu-humanities', 'extract_rate'],
['mmlu-other', 'extract_rate'],
'',
['mmlu_pro', 'extract_rate'],
['mmlu_pro_math', 'extract_rate'],
['mmlu_pro_physics', 'extract_rate'],
['mmlu_pro_chemistry', 'extract_rate'],
['mmlu_pro_law', 'extract_rate'],
['mmlu_pro_engineering', 'extract_rate'],
['mmlu_pro_other', 'extract_rate'],
['mmlu_pro_economics', 'extract_rate'],
['mmlu_pro_health', 'extract_rate'],
['mmlu_pro_psychology', 'extract_rate'],
['mmlu_pro_business', 'extract_rate'],
['mmlu_pro_biology', 'extract_rate'],
['mmlu_pro_philosophy', 'extract_rate'],
['mmlu_pro_computer_science', 'extract_rate'],
['mmlu_pro_history', 'extract_rate'],
'',
['cmmlu', 'extract_rate'],
['cmmlu-stem', 'extract_rate'],
['cmmlu-social-science', 'extract_rate'],
['cmmlu-humanities', 'extract_rate'],
['cmmlu-other', 'extract_rate'],
['cmmlu-china-specific', 'extract_rate'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'chat_objective')
import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
needlebench_datasets as needlebench_8k_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
needlebench_datasets as needlebench_32k_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
needlebench_datasets as needlebench_128k_datasets
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
ruler_datasets as ruler_8k_datasets
from opencompass.configs.datasets.ruler.ruler_32k_gen import \
ruler_datasets as ruler_32k_datasets
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
ruler_datasets as ruler_128k_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
models as lmdeploy_internlm2_5_7b_1m_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as llama3_1_8b_instruct_model
# Instruct models
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model
# Summary Groups
from opencompass.configs.summarizers.groups.longbench import \
longbench_summary_groups
from opencompass.configs.summarizers.groups.ruler import \
ruler_summary_groups
from opencompass.configs.summarizers.needlebench import (
needlebench_8k_summarizer, needlebench_32k_summarizer,
needlebench_128k_summarizer)
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
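# Pull the summary groups out of the prebuilt needlebench summarizers so they
# can be merged with the other *_summary_groups collected below.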
needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
# Instruct models summarizer
summarizer = dict(
dataset_abbrs=[
['ruler_8k', 'naive_average'],
['ruler_32k', 'naive_average'],
['ruler_128k', 'naive_average'],
['NeedleBench-Overall-Score-8K', 'weighted_average'],
['NeedleBench-Overall-Score-32K', 'weighted_average'],
['NeedleBench-Overall-Score-128K', 'weighted_average'],
['longbench', 'naive_average'],
['longbench_zh', 'naive_average'],
['longbench_en', 'naive_average'],
'',
'longbench_single-document-qa',
'longbench_multi-document-qa',
'longbench_summarization',
'longbench_few-shot-learning',
'longbench_synthetic-tasks',
'longbench_code-completion',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
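# Extend both models to a 1M-token window: raise max_seq_len/session_len,
# shard across 4 GPUs (tp=4), and apply RoPE scaling to stretch the
# positional encoding beyond the trained context length.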
lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4
llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'long_context')
import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.models.openai_api import OpenAI, OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
mtbench_datasets
# Summarizer
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
#######################################################################
# PART 3 Models List #
#######################################################################
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2_5-7b-chat-turbomind',
path='internlm/internlm2_5-7b-chat',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=40,
temperature=1.0,
top_p=0.9,
max_new_tokens=4096),
max_seq_len=16384,
max_out_len=4096,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]
models = sum([v for k, v in locals().items() if k.endswith('_model')], models)
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# JudgeLLM
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
type=OpenAISDK,
abbr='gpt-4o-2024-08-06',
path='gpt-4o-2024-08-06',
# openai_api_base=
# 'http://10.140.1.86:10001/v1', # Change to your own url if needed.
key='YOUR_API_KEY',
retry=10,
meta_template=api_meta_template,
rpm_verbose=True,
query_per_second=1,
max_out_len=4096,
max_seq_len=16384,
batch_size=16,
temperature=0.01,
tokenizer_path='gpt-4o-2024-08-06')
]
# Evaluation with local runner
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'chat_subjective')
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard
import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
TurboMindModelwithChatTemplate,
)
#######################################################################
# PART 1 Datasets List #
#######################################################################
with read_base():
# You can comment out the datasets you don't want to evaluate
# Datasets
# from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
# from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
# from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
# from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
# Summarizer
from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# Set LLM Verifier used for each dataset
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
key='sk-1234', # You need to set your own API key
openai_api_base=[
'http://172.30.56.1:4000/v1', # You need to set your own API base
],
meta_template=dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
),
query_per_second=16,
batch_size=1024,
temperature=0.001,
tokenizer_path='gpt-4o-2024-05-13',
verbose=True,
max_out_len=16384,
# max_seq_len=32768,
max_seq_len=49152,
)
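# Attach the verifier to every dataset whose evaluator supports an LLM judge.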
for item in datasets:
# item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment this line to raise the output limit and avoid length cutoff
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
#######################################################################
# PART 2 Model List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
models += [
# You can comment out the models you don't want to evaluate
# All models use sampling mode
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
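# Strip the model's reasoning segment (e.g. <think>...</think>) so only
# the final answer is scored.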
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-14b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=32768),
# max_seq_len=32768,
# max_out_len=32768,
# batch_size=128,
# run_cfg=dict(num_gpus=2),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-32b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=16384),
# max_seq_len=32768,
# max_out_len=16384,
# batch_size=128,
# run_cfg=dict(num_gpus=4),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
]
#######################################################################
# PART 3 Inference/Evaluation #
#######################################################################
# Inference configuration
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=1
# Similar to data parallelism: the number of workers for inference; each
# worker evaluates a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker.
# For example, with 8 GPUs and a 7B model using 1 GPU per instance, set
# num_worker=8 to fully utilize the GPUs; with 8 GPUs and a 14B model using
# 2 GPUs per instance, set num_worker=4.
),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLInferTask)
),
)
# Evaluation configuration
eval = dict(
partitioner=dict(
type=NaivePartitioner, n=8
),
runner=dict(
type=LocalRunner,
task=dict(
type=OpenICLEvalTask)
),
)
#######################################################################
# PART 4 Summarizer #
#######################################################################
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
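# Each repeated-run dataset reports per-run accuracy; the groups below average
# accuracy over the 8 sampled runs.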
summary_groups.extend([
{
'name': 'AIME2024-Average8',
'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
},
{
'name': 'LiveMathBench-v202412-Hard-Average8',
'subsets': [[
f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
]
}
])
# Summarizer
summarizer = dict(
dataset_abbrs=[
'MATH',
# ['LiveMathBench-k1-n1', 'pass@1'],
# ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
# ['aime2024', 'accuracy'],
['math_prm800k_500-llmjudge', 'accuracy'],
['AIME2024-Average8', 'naive_average'],
['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
['OlympiadBenchMath', 'accuracy'],
['OmniMath', 'accuracy'],
],
summary_groups=summary_groups,
)
#######################################################################
# PART 5 Utils #
#######################################################################
work_dir = 'outputs/deepseek_r1_reasoning'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.dingo.dingo_gen import datasets
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
work_dir = './outputs/eval_dingo'
from mmengine.config import read_base
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
PYTHON_INTERPRETER_DESCRIPTION = """\
It can run Python code. The code must be valid Python code that contains only Python functions.
"""
actions = [
dict(
type=PythonInterpreter,
description=PYTHON_INTERPRETER_DESCRIPTION,
answer_expr=None,
)
]
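# The actions list is passed to the CodeAgent below, exposing the Python
# interpreter as a callable tool during inference.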
with read_base():
from opencompass.configs.datasets.ds1000.ds1000_gen_5c4bec import \
ds1000_datasets as datasets
models = [
dict(abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=actions,
batch_size=8),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
# datasets
from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
commonsenseqa_datasets
from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
chid_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen import \
humaneval_datasets
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets
from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
truthfulqa_datasets
# models
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model
from opencompass.configs.models.others.hf_phi_2 import \
models as hf_phi_2_model
from opencompass.configs.models.qwen.hf_qwen2_7b import \
models as hf_qwen2_7b_model
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/edgellm/'
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode phi-2_hf
# ------------------------------------------- --------- ---------------- ------ ----------
# commonsense_qa c946f2 accuracy gen 65.19
# openai_humaneval 8e312c humaneval_pass@1 gen 30.49
# truthful_qa 5ddc62 rouge_max gen 0.08
# truthful_qa 5ddc62 rouge_diff gen -0.00
# truthful_qa 5ddc62 rouge_acc gen 0.41
# gsm8k 1d7fe4 accuracy gen 62.40
# chid-dev 211ee7 accuracy gen 12.87
# chid-test 211ee7 accuracy gen 14.34
# bbh - naive_average gen 59.50
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode Meta-Llama-3-8B_hf
# ------------------------------------------- --------- ---------------- ------ --------------------
# commonsense_qa c946f2 accuracy gen 70.11
# openai_humaneval 8e312c humaneval_pass@1 gen 26.22
# truthful_qa 5ddc62 rouge_max gen 0.07
# truthful_qa 5ddc62 rouge_diff gen -0.01
# truthful_qa 5ddc62 rouge_acc gen 0.41
# gsm8k 1d7fe4 accuracy gen 55.80
# chid-dev 211ee7 accuracy gen 40.59
# chid-test 211ee7 accuracy gen 36.66
# bbh - naive_average gen 61.62
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode qwen2-7b-hf
# -------------- --------- ---------- ------ -------------
# commonsense_qa 734a22 accuracy gen 65.19
# truthful_qa 5ddc62 rouge_max gen 0.08
# truthful_qa 5ddc62 rouge_diff gen -0.02
# truthful_qa 5ddc62 rouge_acc gen 0.44
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.eese.eese_judge_gen import \
eese_datasets
# Choose a model of interest
from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
models as gpt4
from opencompass.models import OpenAISDK
# Configure the judge model
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
judge_cfg = dict(
abbr='model-judge',
type=OpenAISDK,
path='model-name',
key='your-api-key',
openai_api_base=['openai-url'],
meta_template=api_meta_template,
query_per_second=16,
batch_size=1,
temperature=0.001,
tokenizer_path='gpt-4o',
verbose=True,
max_out_len=16384,
max_seq_len=49152,
)
datasets = eese_datasets
models = gpt4
# Add judge_cfg info to each dataset instead of overwriting it
for dataset in datasets:
if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
# Get the existing judge_cfg, or create an empty dict if absent
existing_judge_cfg = dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
# Update the existing judge_cfg, keeping original settings and adding the new ones
existing_judge_cfg.update(judge_cfg)
# Write the updated config back
dataset['eval_cfg']['evaluator']['judge_cfg'] = existing_judge_cfg
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.chat_medium import datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
models = [
dict(
abbr='GPT-3.5-turbo-0613',
type=OpenAI,
path='gpt-3.5-turbo-0613',
key='ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=8),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
# GPT4 needs a special humaneval postprocessor
from opencompass.datasets.humaneval import humaneval_gpt_postprocess
for _dataset in datasets:
if _dataset['path'] == 'openai_humaneval':
_dataset['eval_cfg']['pred_postprocessor'][
'type'] = humaneval_gpt_postprocess
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
models = [
dict(
abbr='GPT4',
type=OpenAI,
path='gpt-4-0613',
key='ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=2048,
batch_size=8),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.hellobench.hellobench import hellobench_datasets
from opencompass.models import HuggingFacewithChatTemplate, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable do_sample for models.
# Make sure your models' generation parameters are set consistently: for example, if you use temperature=0.8, set all models' temperature to 0.8.
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='glm-4-9b-chat-hf',
path='THUDM/glm-4-9b-chat',
max_out_len=16384,
generation_kwargs=dict(
temperature=0.8,
do_sample=True,  # For subjective evaluation, we suggest enabling do_sample when running model inference!
),
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
batch_size=1,
run_cfg=dict(num_gpus=2, num_procs=1),
stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
)
]
datasets = [*hellobench_datasets] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evalation Stage ----------------------------------------
# ------------- JudgeLLM Configuration
# we recommend using gpt-4o-mini as the judge model
# if you want to use open-source LLMs as judge models, you can uncomment the following code
# judge_models = [
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='glm-4-9b-chat-hf',
# path='THUDM/glm-4-9b-chat',
# max_out_len=16384,
# generation_kwargs=dict(
# temperature=0.8,
# do_sample=True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
# ),
# model_kwargs=dict(
# device_map='auto',
# trust_remote_code=True,
# ),
# batch_size=1,
# run_cfg=dict(num_gpus=2, num_procs=1),
# stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
# )
# ]
judge_models = [
dict(
abbr='GPT4o',
type=OpenAI,
path='gpt-4o',
key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=4096,
batch_size=1,
temperature=0.8,
seed=42,
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=DefaultSubjectiveSummarizer)
work_dir = 'outputs/hellobench/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.agieval.agieval_mixed_713d14 import \
agieval_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_3309bd import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_ppl_a6e128 import \
hellaswag_datasets
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
humaneval_datasets
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.datasets.nq.nq_open_gen_e93f8a import nq_datasets
from opencompass.configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import \
BoolQ_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_gen_d18bf4 import \
triviaqa_datasets
from opencompass.configs.datasets.winogrande.winogrande_ll_c5cf57 import \
winogrande_datasets
from opencompass.configs.models.hf_llama.hf_llama2_7b import models
from opencompass.configs.summarizers.example import summarizer
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
work_dir = './outputs/llama2/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.collections.base_medium_llama import (
piqa_datasets, siqa_datasets)
from opencompass.configs.models.hf_llama.hf_llama_7b import models
datasets = [*piqa_datasets, *siqa_datasets]