Commit be3dfa50 authored by jerrrrry

Initial commit
from mmengine.config import read_base
from opencompass.models.huggingface import HuggingFaceCausalLM
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.math.math_gen_736506 import math_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_7b import \
        models as internlm_math_chat_7b_models
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_20b import \
        models as internlm_math_chat_20b_models
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluate MATH and GSM8K for both the InternLM2-Math-Chat 7B and 20B models
datasets = [*math_datasets, *gsm8k_datasets]
models = [*internlm_math_chat_7b_models, *internlm_math_chat_20b_models]
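# A minimal launch sketch for a config like the one above (the file name
# configs/eval_internlm_math_chat.py is an assumption about where this snippet lives):
#     python run.py configs/eval_internlm_math_chat.py
# The output directory can be set via work_dir in the config, as the later
# configs in this commit do.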
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# config for internlm-7b model
internlm_7b = dict(
    type=TurboMindModel,
    abbr='internlm-7b-turbomind',
    path='internlm/internlm-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# config for internlm-20b model
internlm_20b = dict(
    type=TurboMindModel,
    abbr='internlm-20b-turbomind',
    path='internlm/internlm-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
models = [internlm_20b]
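# Only the 20B engine is evaluated above. To benchmark both engines in a single
# run, a small variant of this config would be:
# models = [internlm_7b, internlm_20b]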
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import judgerbench_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable sampling (do_sample) for the models
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-7B-Instruct',
        path='opencompass/CompassJudger-1-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
datasets = judgerbench_datasets
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)

# -------------Evaluation Stage ----------------------------------------
## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=10,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)
work_dir = 'outputs/judgerbench/'
from mmengine import read_base
with read_base():
    from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import \
        korbench_mixed_datasets as mixed_datasets
    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
        korbench_0shot_single_datasets as zero_shot_datasets
    from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import \
        korbench_3shot_single_datasets as three_shot_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b
datasets = zero_shot_datasets + three_shot_datasets + mixed_datasets
models = hf_internlm2_5_7b
from mmengine.config import read_base
from opencompass.models import LightllmAPI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
    from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
        humaneval_datasets
    from opencompass.configs.summarizers.leaderboard import summarizer
datasets = [*humaneval_datasets]
'''
# Prompt template for InternLM2-Chat
# https://github.com/InternLM/InternLM/blob/main/chat/chat_format.md
_meta_template = dict(
    begin='<|im_start|>system\nYou are InternLM2-Chat, a harmless AI assistant<|im_end|>\n',
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ]
)
'''
_meta_template = None
models = [
    dict(
        abbr='LightllmAPI',
        type=LightllmAPI,
        url='http://localhost:1030/generate',
        meta_template=_meta_template,
        batch_size=32,
        max_workers_per_task=128,
        rate_per_worker=1024,
        retry=4,
        generation_kwargs=dict(do_sample=False,
                               ignore_eos=False,
                               max_new_tokens=1024),
    ),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLInferTask),
    ),
)
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.livestembench.livestembench_gen_3e3c50 import \
        livestembench_datasets
    # choose a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as qwen2_5_7b_instruct_lmdeploy_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_lmdeploy_model
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = [
    *qwen2_5_7b_instruct_lmdeploy_model, *qwen2_5_72b_instruct_lmdeploy_model
]
# Judge model configuration
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='YOUR_SERVER_MODEL_NAME',  # the model name of your deployment
    key='None',
    openai_api_base=[
        'http://localhost:23333/v1',  # the address where your model is served
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_completion_tokens=32768,
)
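# The judge above assumes an OpenAI-compatible endpoint is already running at
# http://localhost:23333/v1. With LMDeploy such a server can be started roughly
# like this (a sketch; the model path is a placeholder, not part of this config):
#     lmdeploy serve api_server Qwen/Qwen2.5-72B-Instruct --server-port 23333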
for dataset in datasets:
    dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)
work_dir = './outputs/livestembench'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import (
        piqa_datasets, siqa_datasets)
    from opencompass.configs.models.llama.llama2_7b import models
datasets = [*piqa_datasets, *siqa_datasets]
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.lveval.lveval import \
        LVEval_datasets as datasets
    from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models
    from opencompass.configs.summarizers.lveval import summarizer
models[0]['path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf'
models[0]['tokenizer_path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf'
models[0]['max_seq_len'] = 4096
models[0]['generation_kwargs'] = dict(do_sample=False)
models[0]['mode'] = 'mid' # truncate in the middle
from mmengine.config import read_base
with read_base():
    from opencompass.configs.dataset_collections.chat_OC15 import datasets
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as hf_llama3_8b_instruct_model
    from opencompass.configs.summarizers.chat_OC15 import summarizer
work_dir = 'outputs/debug/llama3-instruct'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# dataset                version    metric                        mode    llama-3-8b-instruct-hf
# ---------------------  ---------  ----------------------------  ------  ------------------------
# average                -          naive_average                 gen     55.64
# mmlu                   -          naive_average                 gen     68.30
# cmmlu                  -          naive_average                 gen     53.29
# ceval                  -          naive_average                 gen     52.32
# GaokaoBench            -          weighted_average              gen     45.91
# triviaqa_wiki_1shot    eaf81e     score                         gen     79.01
# nq_open_1shot          01cf41     score                         gen     30.25
# race-high              9a54b6     accuracy                      gen     81.22
# winogrande             b36770     accuracy                      gen     66.46
# hellaswag              e42710     accuracy                      gen     74.33
# bbh                    -          naive_average                 gen     67.25
# gsm8k                  1d7fe4     accuracy                      gen     79.08
# math                   393424     accuracy                      gen     27.78
# TheoremQA              6f0af8     score                         gen     19.50
# openai_humaneval       8e312c     humaneval_pass@1              gen     55.49
# sanitized_mbpp         830460     score                         gen     66.54
# GPQA_diamond           4baadb     accuracy                      gen     25.76
# IFEval                 3321a3     Prompt-level-strict-accuracy  gen     67.84
# -                      -          -                             -
# mmlu                   -          naive_average                 gen     68.30
# mmlu-stem              -          naive_average                 gen     57.92
# mmlu-social-science    -          naive_average                 gen     77.83
# mmlu-humanities        -          naive_average                 gen     71.20
# mmlu-other             -          naive_average                 gen     71.79
# cmmlu                  -          naive_average                 gen     53.29
# cmmlu-stem             -          naive_average                 gen     45.40
# cmmlu-social-science   -          naive_average                 gen     54.63
# cmmlu-humanities       -          naive_average                 gen     54.14
# cmmlu-other            -          naive_average                 gen     59.52
# cmmlu-china-specific   -          naive_average                 gen     49.33
# ceval                  -          naive_average                 gen     52.32
# ceval-stem             -          naive_average                 gen     48.16
# ceval-social-science   -          naive_average                 gen     57.50
# ceval-humanities       -          naive_average                 gen     53.26
# ceval-other            -          naive_average                 gen     54.26
# ceval-hard             -          naive_average                 gen     35.59
from mmengine.config import read_base
with read_base():
    # LLM compression datasets
    from opencompass.configs.datasets.llm_compression.llm_compression import llm_compression_datasets
    # Model configs
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
    from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
    from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
    from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import LLMCompressionSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
# -------------Inference Stage ----------------------------------------
datasets = [*llm_compression_datasets]
work_dir = 'outputs/llm_compression'

models = [
    *qwen1_5_7b,
    *qwen1_5_14b,
    *llama2_7b,
    *llama2_13b,
]

# Set custom batch_size and num_gpus for faster loss calculation
# Smaller batch_size should give more precise results, at the cost of worse performance
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
for mdl in models:
    mdl.update(model_cfg)
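# Optional variant: dict.update makes every model share the same run_cfg object.
# If each model should own an independent copy (for example, to tweak GPUs per
# model later), copy the template first -- a minor alternative, not required here:
#     from copy import deepcopy
#     for mdl in models:
#         mdl.update(deepcopy(model_cfg))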
infer = dict(
    # The OpenCompass implementation of BPC currently only supports NaivePartitioner,
    # as the sliding window approach requires the dataset to be loaded sequentially.
    # Using other partitioner types may produce incorrect results.
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # Maximum concurrent evaluation task count
    ),
)
# -------------Evaluation Stage ----------------------------------------
eval = dict(partitioner=dict(type=NaivePartitioner),
            runner=dict(
                type=LocalRunner,
                task=dict(type=OpenICLEvalTask),
                max_num_workers=256,
            ))
# -------------Summarization Stage ----------------------------------------
summarizer = dict(type=LLMCompressionSummarizer)
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
# Import pre-configured models from OpenCompass
with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct_model,
    )
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=CustomDataset,
            path='opencompass/math',
            file_name='test_prm800k_500.jsonl',
            reader_cfg=math_reader_cfg,
        ),
        judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
)
# Dataset configuration
datasets = [
    dict(
        type=CustomDataset,
        path='opencompass/math',
        file_name='test_prm800k_500.jsonl',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model
# Limiting test to first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'
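# The '[0:8]' slice keeps only the first 8 test examples. For a full evaluation,
# drop this override, e.g. (an illustrative toggle, not part of the original config):
#     math_reader_cfg.pop('test_range', None)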
# Output directory
work_dir = 'outputs/llm_judge'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
        gsm8k_datasets
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_1_8b_chat import \
        models
datasets = gsm8k_datasets
models = models
from mmengine.config import read_base
with read_base():
    # Models
    # Datasets
    from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
        LongBenchv2_datasets as LongBenchv2_datasets
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['max_seq_len'] = 128 * 1024
    model['engine_config']['session_len'] = 128 * 1024
    model['engine_config']['tp'] = 2
    model['run_cfg']['num_gpus'] = 2
    # Drop middle tokens to keep the input shorter than session_len; 128k is used
    # to stay in sync with the original LongBench v2 code.
    # Dropping the middle is currently only supported for LMDeploy models.
    model['drop_middle'] = True
work_dir = './outputs/longbenchv2'
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base
with read_base():
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model  # noqa: F401, F403
    from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model  # noqa: F401, F403
    from opencompass.configs.datasets.math.math_llm_judge import math_datasets  # noqa: F401, F403
from opencompass.datasets import math_judement_preprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AllObjSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Prompt Settings ----------------------------------------
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
Examples:
Expression 1: $2x+3$
Expression 2: $3+2x$
[Yes]
Expression 1: 3/2
Expression 2: 1.5
[Yes]
Expression 1: $x^2+2x+1$
Expression 2: $y^2+2y+1$
[No]
Expression 1: $x^2+2x+1$
Expression 2: $(x+1)^2$
[Yes]
Expression 1: 3245/5
Expression 2: 649
[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
Expression 1: 2/(-3)
Expression 2: -2/3
[Yes]
(trivial simplifications are allowed)
Expression 1: 72 degrees
Expression 2: 72
[Yes]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2: 64 square feet
[Yes]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2:
[No]
(only mark as equivalent if both expressions are nonempty)
---
YOUR TASK
Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
Expression 1: {obj_gold}
Expression 2: {prediction}
"""
# -------------Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model
eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'
for d in eng_datasets:
    d['eval_cfg'] = dict(
        evaluator=dict(
            type=LMEvaluator,
            # If you need to preprocess the prediction before judging,
            # you can specify the pred_postprocessor function here
            pred_postprocessor=dict(type=math_judement_preprocess),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt=eng_obj_prompt),
                ]),
            ),
        ),
        pred_role='BOT',
    )
infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)
# ------------- Evaluation Configuration --------------------------------
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=80000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=AllObjSummarizer)
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import \
        math_datasets
    # choose a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_model
eval_model_name = 'eval_model_name'
postprocessor_model_name = 'postprocessor_model_name'
eval_model_urls = ['http://0.0.0.0:23333/v1']
postprocessor_model_urls = ['http://0.0.0.0:23333/v1']
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for dataset in datasets:
    dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name
    dataset['eval_cfg']['evaluator']['url'] = eval_model_urls
    dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls
    dataset['eval_cfg']['evaluator']['post_model_name'] = postprocessor_model_name
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
    from opencompass.configs.datasets.math.math_500_gen import math_datasets
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-llama-8b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        gen_config=dict(
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-14b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
        gen_config=dict(
            top_k=1,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]
datasets = [*math_datasets]
work_dir = './outputs/math_500'
from mmengine.config import read_base
with read_base():
    # Import models
    # Import datasets
    from opencompass.configs.datasets.MathBench.mathbench_gen import \
        mathbench_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as internlm2_chat_7b_model
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as llama3_8b_instruct_model
    # Import summarizers for displaying results
    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
        summarizer  # Grouped results for MathBench-A and MathBench-T separately
    # from opencompass.configs.summarizers.mathbench_v1 import summarizer  # Detailed results for every sub-dataset
    # from opencompass.configs.summarizers.groups.mathbench_v1_2024_lang import summarizer  # Grouped bilingual results
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)
work_dir = './outputs/mathbench_results'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import \
        mmlu_cf_datasets
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
        models as hf_qwen2_5_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_cf import summarizer
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)
work_dir = 'outputs/debug/mmlu_cf'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
        mmlu_pro_datasets
    from opencompass.configs.internal.clusters.local import eval
    from opencompass.configs.internal.clusters.local import \
        infer_num_worker as infer
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_pro import summarizer
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = 'outputs/debug/mmlu_pro'
# dataset                    version    metric         mode    qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind
# -------------------------  ---------  -------------  ------  -----------------------------  -------------------------------
# mmlu_pro                   -          naive_average  gen     46.18                          43.92
# mmlu_pro_biology           736233     accuracy       gen     63.74                          64.02
# mmlu_pro_business          736233     accuracy       gen     53.23                          46.01
# mmlu_pro_chemistry         736233     accuracy       gen     35.25                          32.42
# mmlu_pro_computer_science  736233     accuracy       gen     47.07                          44.88
# mmlu_pro_economics         736233     accuracy       gen     59.00                          53.79
# mmlu_pro_engineering       736233     accuracy       gen     26.73                          33.54
# mmlu_pro_health            736233     accuracy       gen     47.31                          51.34
# mmlu_pro_history           736233     accuracy       gen     42.78                          42.26
# mmlu_pro_law               736233     accuracy       gen     28.07                          26.98
# mmlu_pro_math              736233     accuracy       gen     53.59                          37.53
# mmlu_pro_philosophy        736233     accuracy       gen     42.28                          42.48
# mmlu_pro_physics           736233     accuracy       gen     39.11                          33.64
# mmlu_pro_psychology        736233     accuracy       gen     60.90                          59.65
# mmlu_pro_other             736233     accuracy       gen     47.40                          46.32
from copy import deepcopy
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
        mmlu_datasets  # this is a dataset evaluated with 5-shot
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
datasets = []
for d in mmlu_datasets:
    d = deepcopy(d)
    d['infer_cfg']['retriever'] = dict(type=ZeroRetriever)
    datasets.append(d)
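# The loop above turns the 5-shot MMLU config into a 0-shot variant by swapping
# in a ZeroRetriever. If both variants run in the same workspace, giving the
# copies a distinct abbr avoids result-name collisions (an illustrative tweak,
# not part of the original config), e.g. inside the loop:
#     d['abbr'] = d['abbr'] + '-0shot'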