import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.models.openai_api import OpenAI, OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
mtbench_datasets
# Summarizer
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
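# Collect every list imported above whose variable name ends with '_datasets'
# (here: the alignbench, arenahard and mtbench imports from read_base).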
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
#######################################################################
# PART 3 Models List #
#######################################################################
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2_5-7b-chat-turbomind',
path='internlm/internlm2_5-7b-chat',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=40,
temperature=1.0,
top_p=0.9,
max_new_tokens=4096),
max_seq_len=16384,
max_out_len=4096,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]
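# Append any model lists imported via read_base whose variable names end with
# '_model' (e.g. the commented-out lmdeploy imports above) to the list defined here.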
models = sum([v for k, v in locals().items() if k.endswith('_model')], models)
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# JudgeLLM
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
type=OpenAISDK,
abbr='gpt-4o-2024-08-06',
path='gpt-4o-2024-08-06',
# openai_api_base=
# 'http://10.140.1.86:10001/v1', # Change to your own url if needed.
key='YOUR_API_KEY',
retry=10,
meta_template=api_meta_template,
rpm_verbose=True,
query_per_second=1,
max_out_len=4096,
max_seq_len=16384,
batch_size=16,
temperature=0.01,
tokenizer_path='gpt-4o-2024-08-06')
]
# Evaluation with local runner
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'chat_subjective')
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard
import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
TurboMindModelwithChatTemplate,
)
#######################################################################
# PART 1 Datasets List #
#######################################################################
with read_base():
# You can comment out the datasets you don't want to evaluate
# Datasets
# from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
# from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
# from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
# from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
# Summarizer
from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# Set LLM Verifier used for each dataset
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
key='sk-1234', # You need to set your own API key
openai_api_base=[
'http://172.30.56.1:4000/v1', # You need to set your own API base
],
meta_template=dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
),
query_per_second=16,
batch_size=1024,
temperature=0.001,
tokenizer_path='gpt-4o-2024-05-13',
verbose=True,
max_out_len=16384,
# max_seq_len=32768,
max_seq_len=49152,
)
for item in datasets:
# item['infer_cfg']['inferencer']['max_out_len'] = 32768 # Uncomment this line if you want to avoid length cutoff
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
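# After the loop above, every dataset whose evaluator exposes a judge_cfg field is
# scored by the Qwen2.5-32B verifier configured above.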
#######################################################################
# PART 2 Model List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
models += [
# You can comment out the models you don't want to evaluate
# All models use sampling mode
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-14b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=32768),
# max_seq_len=32768,
# max_out_len=32768,
# batch_size=128,
# run_cfg=dict(num_gpus=2),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-32b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=16384),
# max_seq_len=32768,
# max_out_len=16384,
# batch_size=128,
# run_cfg=dict(num_gpus=4),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
]
#######################################################################
# PART 3 Inference/Evaluation #
#######################################################################
# Inference configuration
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=1
# Similar to data parallelism: num_worker is the number of inference workers,
# each evaluating one part of the dataset. Total GPUs = num_worker * num_gpus_per_worker.
# For example, with 8 GPUs and a 7B model that uses 1 GPU per instance, set num_worker=8
# to fully utilize the GPUs; with 8 GPUs and a 14B model that uses 2 GPUs per instance,
# set num_worker=4 (see the worked example after this block).
),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLInferTask)
),
)
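# A minimal sketch of the worker/GPU arithmetic described above. The numbers and the
# underscore-prefixed names are illustrative assumptions only and are not read by OpenCompass.
_total_gpus = 8                                            # assumed node size
_gpus_per_instance = 1                                     # e.g. one 7B-model instance per GPU
_suggested_num_worker = _total_gpus // _gpus_per_instance  # -> 8 workers, one per GPU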
# Evaluation configuration
eval = dict(
partitioner=dict(
type=NaivePartitioner, n=8
),
runner=dict(
type=LocalRunner,
task=dict(
type=OpenICLEvalTask)
),
)
#######################################################################
# PART 4 Summarizer #
#######################################################################
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.extend([
{
'name': 'AIME2024-Aveage8',
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
},
{
'name': 'LiveMathBench-v202412-Hard-Aveage8',
'subsets':[[
f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
]
}
])
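# For reference, the comprehensions above expand to subset entries such as
# ['aime2024-run0', 'accuracy'] ... ['aime2024-run7', 'accuracy'] and
# ['livemathbench_hard_custom_hard_cn_run0', 'accuracy'] ... ['livemathbench_hard_custom_hard_en_run7', 'accuracy'],
# which are then averaged into the two groups defined here.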
# Summarizer
summarizer = dict(
dataset_abbrs=[
'MATH',
# ['LiveMathBench-k1-n1', 'pass@1'],
# ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
# ['aime2024', 'accuracy'],
['math_prm800k_500-llmjudge', 'accuracy'],
['AIME2024-Aveage8', 'naive_average'],
['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'],
['OlympiadBenchMath', 'accuracy'],
['OmniMath', 'accuracy'],
],
summary_groups=summary_groups,
)
#######################################################################
# PART 5 Utils #
#######################################################################
work_dir = 'outputs/deepseek_r1_reasoning'
from mmengine.config import read_base
with read_base():
from .datasets.dingo.dingo_gen import datasets
from .models.hf_internlm.hf_internlm_7b import models
work_dir = './outputs/eval_dingo'
from mmengine.config import read_base
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
PYTHON_INTERPRETER_DESCRIPTION = """\
It can run Python code. The code must be valid Python and contain only Python functions.
"""
actions = [
dict(
type=PythonInterpreter,
description=PYTHON_INTERPRETER_DESCRIPTION,
answer_expr=None,
)
]
with read_base():
from opencompass.configs.datasets.ds1000.ds1000_gen_5c4bec import \
ds1000_datasets as datasets
models = [
dict(abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=actions,
batch_size=8),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
# datasets
from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
commonsenseqa_datasets
from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
chid_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen import \
humaneval_datasets
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets
from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
truthfulqa_datasets
# models
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model
from opencompass.configs.models.others.hf_phi_2 import \
models as hf_phi_2_model
from opencompass.configs.models.qwen.hf_qwen2_7b import \
models as hf_qwen2_7b_model
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/edgellm/'
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode phi-2_hf
# ------------------------------------------- --------- ---------------- ------ ----------
# commonsense_qa c946f2 accuracy gen 65.19
# openai_humaneval 8e312c humaneval_pass@1 gen 30.49
# truthful_qa 5ddc62 rouge_max gen 0.08
# truthful_qa 5ddc62 rouge_diff gen -0.00
# truthful_qa 5ddc62 rouge_acc gen 0.41
# gsm8k 1d7fe4 accuracy gen 62.40
# chid-dev 211ee7 accuracy gen 12.87
# chid-test 211ee7 accuracy gen 14.34
# bbh - naive_average gen 59.50
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode Meta-Llama-3-8B_hf
# ------------------------------------------- --------- ---------------- ------ --------------------
# commonsense_qa c946f2 accuracy gen 70.11
# openai_humaneval 8e312c humaneval_pass@1 gen 26.22
# truthful_qa 5ddc62 rouge_max gen 0.07
# truthful_qa 5ddc62 rouge_diff gen -0.01
# truthful_qa 5ddc62 rouge_acc gen 0.41
# gsm8k 1d7fe4 accuracy gen 55.80
# chid-dev 211ee7 accuracy gen 40.59
# chid-test 211ee7 accuracy gen 36.66
# bbh - naive_average gen 61.62
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode qwen2-7b-hf
# -------------- --------- ---------- ------ -------------
# commonsense_qa 734a22 accuracy gen 65.19
# truthful_qa 5ddc62 rouge_max gen 0.08
# truthful_qa 5ddc62 rouge_diff gen -0.02
# truthful_qa 5ddc62 rouge_acc gen 0.44
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.chat_medium import datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
models = [
dict(
abbr='GPT-3.5-turbo-0613',
type=OpenAI,
path='gpt-3.5-turbo-0613',
key=
'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=8),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.summarizers.medium import summarizer
# GPT4 needs a special humaneval postprocessor
from opencompass.datasets.humaneval import humaneval_gpt_postprocess
for _dataset in datasets:
if _dataset['path'] == 'openai_humaneval':
_dataset['eval_cfg']['pred_postprocessor'][
'type'] = humaneval_gpt_postprocess
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
models = [
dict(
abbr='GPT4',
type=OpenAI,
path='gpt-4-0613',
key=
'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=2048,
batch_size=8),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.hellobench.hellobench import hellobench_datasets
from opencompass.models import HuggingFacewithChatTemplate, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample=True) for models.
# Make sure all models' generation parameters are set consistently; for example, if you use temperature=0.8, set it to 0.8 for every model.
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='glm-4-9b-chat-hf',
path='THUDM/glm-4-9b-chat',
max_out_len=16384,
generation_kwargs=dict(
temperature=0.8,
do_sample=True,  # For subjective evaluation, we suggest setting do_sample=True during inference
),
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
batch_size=1,
run_cfg=dict(num_gpus=2, num_procs=1),
stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
)
]
datasets = [*hellobench_datasets] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
# ------------- JudgeLLM Configuration
# we recommend using gpt-4o-mini as the judge model
# if you want to use open-source LLMs as judge models, you can uncomment the following code
# judge_models = [
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='glm-4-9b-chat-hf',
# path='THUDM/glm-4-9b-chat',
# max_out_len=16384,
# generation_kwargs=dict(
# temperature=0.8,
# do_sample=True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
# ),
# model_kwargs=dict(
# device_map='auto',
# trust_remote_code=True,
# ),
# batch_size=1,
# run_cfg=dict(num_gpus=2, num_procs=1),
# stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
# )
# ]
judge_models = [
dict(
abbr='GPT4o',
type=OpenAI,
path='gpt-4o',
key='xxxx',  # Replace with your API key, or set key='ENV' to read it from $OPENAI_API_KEY
meta_template=api_meta_template,
query_per_second=16,
max_out_len=4096,
batch_size=1,
temperature=0.8,
seed=42,
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=DefaultSubjectiveSummarizer)
work_dir = 'outputs/hellobench/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.agieval.agieval_mixed_713d14 import \
agieval_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_3309bd import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_ppl_a6e128 import \
hellaswag_datasets
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
humaneval_datasets
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.datasets.nq.nq_open_gen_e93f8a import nq_datasets
from opencompass.configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import \
BoolQ_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_gen_d18bf4 import \
triviaqa_datasets
from opencompass.configs.datasets.winogrande.winogrande_ll_c5cf57 import \
winogrande_datasets
from opencompass.configs.models.hf_llama.hf_llama2_7b import models
from opencompass.configs.summarizers.example import summarizer
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
work_dir = './outputs/llama2/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.collections.base_medium_llama import (
piqa_datasets, siqa_datasets)
from opencompass.configs.models.hf_llama.hf_llama_7b import models
datasets = [*piqa_datasets, *siqa_datasets]
from mmengine.config import read_base
with read_base():
# Inference PPL datasets
from opencompass.configs.datasets.inference_ppl.inference_ppl import inference_ppl_datasets
# Model configs
from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
# -------------Inference Stage ----------------------------------------
datasets = [*inference_ppl_datasets]
work_dir = 'outputs/inference_ppl'
models = [
*qwen1_5_7b,
*qwen1_5_14b,
*llama2_7b,
*llama2_13b,
]
# Set custom batch_size and num_gpus for faster loss calculation
# Smaller batch_size should give more precise results, at the cost of worse efficiency
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
for mdl in models:
mdl.update(model_cfg)
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLInferTask),
max_num_workers=256, # Maximum number of concurrent inference tasks
),
)
# -------------Evaluation Stage ----------------------------------------
eval = dict(partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLEvalTask),
max_num_workers=256,
))
from mmengine.config import read_base
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.base_medium import datasets
# choose a model of interest
from opencompass.configs.models.internlm.internlm_7b import models
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
from copy import deepcopy
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import \
agieval_datasets
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.math.math_evaluatorv2_gen_cecb31 import \
math_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
models as hf_internlm2_chat_7b_model
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
models as hf_internlm2_chat_20b_model
from opencompass.configs.summarizers.internlm2_keyset import summarizer
work_dir = './outputs/internlm2-chat-keyset/'
_origin_datasets = sum(
[v for k, v in locals().items() if k.endswith('_datasets')], [])
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
_vanilla_datasets = [deepcopy(d) for d in _origin_datasets]
_vanilla_models = []
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if any(r['role'] == 'SYSTEM' for r in round):
new_round = [r for r in round if r['role'] != 'SYSTEM']
print(
f'WARNING: remove SYSTEM round in meta_template for {m.get("abbr", None)}'
)
m['meta_template']['round'] = new_round
_vanilla_models.append(m)
datasets = _vanilla_datasets
models = _vanilla_models
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.agieval.agieval_mixed_713d14 import \
agieval_datasets
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
humaneval_datasets
from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model
from opencompass.configs.summarizers.internlm2_keyset import summarizer
work_dir = './outputs/internlm2-keyset/'
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# To run this example, you need to do the following steps:
# 1. Install latest opencompass
# 2. Start a local server hosting Qwen2.5-72B-Instruct as the LLM judge (e.g. using vLLM or LMDeploy)
# 3. Change the judge_cfg openai_api_base to your corresponding local server address
# 4. Start this evaluation by running 'opencompass eval_internlm3_math500_thinking.py'
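# A hedged example of step 2 (commands and flags assumed from the LMDeploy/vLLM CLIs;
# adjust the model path, port, and parallelism to your own environment):
#   lmdeploy serve api_server Qwen/Qwen2.5-72B-Instruct --server-port 23333 --tp 4
#   # or: vllm serve Qwen/Qwen2.5-72B-Instruct --port 23333 --tensor-parallel-size 4
# The resulting OpenAI-compatible endpoint (http://<your-host>:23333/v1/) is what
# judge_cfg['openai_api_base'] below should point to.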
from opencompass.models import VLLMwithChatTemplate, OpenAISDK
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.math.math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000 import (
math_datasets,
)
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
)
judge_cfg = dict(
abbr='qwen2-5-72b-instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-72B-Instruct',
key='YOUR_API_KEY',
openai_api_base=[
'http://172.30.56.81:23333/v1/', ### Change to your own server
],
meta_template=api_meta_template,
query_per_second=16,
batch_size=16,
temperature=0.001,
max_seq_len=32768,
max_completion_tokens=32768,
)
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# set max_out_len for inference
for item in datasets:
item['infer_cfg']['inferencer']['max_out_len'] = 16384
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
reasoning_chat_template = """You are an expert mathematician with extensive experience in mathematical competitions. You approach problems through systematic thinking and rigorous reasoning. When solving problems, follow these thought processes:
## Deep Understanding
Take time to fully comprehend the problem before attempting a solution. Consider:
- What is the real question being asked?
- What are the given conditions and what do they tell us?
- Are there any special restrictions or assumptions?
- Which information is crucial and which is supplementary?
## Multi-angle Analysis
Before solving, conduct thorough analysis:
- What mathematical concepts and properties are involved?
- Can you recall similar classic problems or solution methods?
- Would diagrams or tables help visualize the problem?
- Are there special cases that need separate consideration?
## Systematic Thinking
Plan your solution path:
- Propose multiple possible approaches
- Analyze the feasibility and merits of each method
- Choose the most appropriate method and explain why
- Break complex problems into smaller, manageable steps
## Rigorous Proof
During the solution process:
- Provide solid justification for each step
- Include detailed proofs for key conclusions
- Pay attention to logical connections
- Be vigilant about potential oversights
## Repeated Verification
After completing your solution:
- Verify your results satisfy all conditions
- Check for overlooked special cases
- Consider if the solution can be optimized or simplified
- Review your reasoning process
Remember:
1. Take time to think thoroughly rather than rushing to an answer
2. Rigorously prove each key conclusion
3. Keep an open mind and try different approaches
4. Summarize valuable problem-solving methods
5. Maintain healthy skepticism and verify multiple times
Your response should reflect deep mathematical understanding and precise logical thinking, making your solution path and reasoning clear to others.
When you're ready, present your complete solution with:
- Clear problem understanding
- Detailed solution process
- Key insights
- Thorough verification
Focus on clear, logical progression of ideas and thorough explanation of your mathematical reasoning. Provide answers in the same language as the user asking the question, repeat the final answer using a '\\boxed{}' without any units, you have [[8192]] tokens to complete the answer.
"""
reasoning_meta_template = dict(
begin=dict(
role='SYSTEM', api_role='SYSTEM', prompt=reasoning_chat_template
),
round=[
dict(role='HUMAN', api_role='HUMAN'),
# XXX: all system roles are mapped to human on purpose
dict(role='BOT', api_role='BOT', generate=True),
],
)
models = [
dict(
type=VLLMwithChatTemplate,
abbr='internlm3-8b-instruct-vllm',
path='internlm/internlm3-8b-instruct',
model_kwargs=dict(tensor_parallel_size=1),
generation_kwargs=dict(do_sample=False), # greedy
max_seq_len=32768,
max_out_len=16384,
batch_size=16,
run_cfg=dict(num_gpus=1),
meta_template=reasoning_meta_template,
)
]
datasets = math_datasets
from mmengine.config import read_base
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.base_medium import datasets
# choose a model of interest
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
from mmengine.config import read_base
from opencompass.models.turbomind_api import TurboMindAPIModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
crowspairs_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
WiC_datasets
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
WSC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
triviaqa_datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
internlm_chat_20b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
api_addr='http://0.0.0.0:23333',
api_key='internlm-chat-20b', # api_key
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
internlm_chat_7b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-7b-turbomind',
api_addr='http://0.0.0.0:23333',
api_key='internlm-chat-7b', # api_key
max_out_len=100,
max_seq_len=2048,
batch_size=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
models = [internlm_chat_20b]
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
crowspairs_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
WiC_datasets
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
WSC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
triviaqa_datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
internlm_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
internlm2_meta_template = dict(round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT',
begin='<|im_start|>assistant\n',
end='<|im_end|>\n',
generate=True),
],
eos_token_id=92542)
# config for internlm-chat-7b
internlm_chat_7b = dict(
type=TurboMindModel,
abbr='internlm-chat-7b-turbomind',
path='internlm/internlm-chat-7b',
engine_config=dict(session_len=2048,
max_batch_size=32,
rope_scaling_factor=1.0),
gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=32,
concurrency=32,
meta_template=internlm_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
# config for internlm2-chat-7b
internlm2_chat_7b = dict(type=TurboMindModel,
abbr='internlm2-chat-7b-turbomind',
path='internlm/internlm2-chat-7b',
engine_config=dict(session_len=2048,
max_batch_size=32,
rope_scaling_factor=1.0),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=32,
concurrency=32,
meta_template=internlm2_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>')
# config for internlm-chat-20b
internlm_chat_20b = dict(
type=TurboMindModel,
abbr='internlm-chat-20b-turbomind',
path='internlm/internlm-chat-20b',
engine_config=dict(session_len=2048,
max_batch_size=8,
rope_scaling_factor=1.0),
gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
meta_template=internlm_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
models = [internlm_chat_20b]
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import FlamesSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Inference Stage ----------------------------------------
with read_base():
from opencompass.configs.datasets.flames.flames_gen import flames_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
models
datasets = [*flames_datasets]
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT',
begin='<|im_start|>assistant\n',
end='<|im_end|>\n',
generate=True),
], )
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm2-chat-7b-hf',
path='internlm/internlm2-chat-7b',
tokenizer_path='internlm/internlm2-chat-7b',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
generation_kwargs={
'eos_token_id': [2, 92542],
'do_sample': True
},
batch_padding=True,
)
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration---------------------------------
internlm1_chat_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
judge_models = [
dict(
type=HuggingFaceCausalLM,
abbr='flames-scorer',
path='CaasiHUANG/flames-scorer',
tokenizer_path='CaasiHUANG/flames-scorer',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
generation_kwargs={'do_sample': True},
max_out_len=512,
max_seq_len=4096,
batch_size=8,
meta_template=internlm1_chat_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]
## ------------- Evaluation Configuration----------------
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
mode='singlescore',
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=FlamesSummarizer, judge_type='general')
work_dir = 'outputs/flames/'
from mmengine.config import read_base
from opencompass.models.turbomind_api import TurboMindAPIModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
WiC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
triviaqa_datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
internlm_chat_20b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
internlm_chat_7b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-7b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)
models = [internlm_chat_20b]