Commit c289ecc0 authored by xinghao

Initial commit
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.lveval.lveval import \
LVEval_datasets as datasets
from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models
from opencompass.configs.summarizers.lveval import summarizer
models[0]['path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf'
models[0][
'tokenizer_path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf'
models[0]['max_seq_len'] = 4096
models[0]['generation_kwargs'] = dict(do_sample=False)
models[0]['mode'] = 'mid' # truncate in the middle
from mmengine.config import read_base
with read_base():
from opencompass.configs.dataset_collections.chat_OC15 import datasets
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model
from opencompass.configs.summarizers.chat_OC15 import summarizer
work_dir = 'outputs/debug/llama3-instruct'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# dataset version metric mode llama-3-8b-instruct-hf
# -------------------- --------- ---------------------------- ------ ------------------------
# average - naive_average gen 55.64
# mmlu - naive_average gen 68.30
# cmmlu - naive_average gen 53.29
# ceval - naive_average gen 52.32
# GaokaoBench - weighted_average gen 45.91
# triviaqa_wiki_1shot eaf81e score gen 79.01
# nq_open_1shot 01cf41 score gen 30.25
# race-high 9a54b6 accuracy gen 81.22
# winogrande b36770 accuracy gen 66.46
# hellaswag e42710 accuracy gen 74.33
# bbh - naive_average gen 67.25
# gsm8k 1d7fe4 accuracy gen 79.08
# math 393424 accuracy gen 27.78
# TheoremQA 6f0af8 score gen 19.50
# openai_humaneval 8e312c humaneval_pass@1 gen 55.49
# sanitized_mbpp 830460 score gen 66.54
# GPQA_diamond 4baadb accuracy gen 25.76
# IFEval 3321a3 Prompt-level-strict-accuracy gen 67.84
# - - - -
# mmlu - naive_average gen 68.30
# mmlu-stem - naive_average gen 57.92
# mmlu-social-science - naive_average gen 77.83
# mmlu-humanities - naive_average gen 71.20
# mmlu-other - naive_average gen 71.79
# cmmlu - naive_average gen 53.29
# cmmlu-stem - naive_average gen 45.40
# cmmlu-social-science - naive_average gen 54.63
# cmmlu-humanities - naive_average gen 54.14
# cmmlu-other - naive_average gen 59.52
# cmmlu-china-specific - naive_average gen 49.33
# ceval - naive_average gen 52.32
# ceval-stem - naive_average gen 48.16
# ceval-social-science - naive_average gen 57.50
# ceval-humanities - naive_average gen 53.26
# ceval-other - naive_average gen 54.26
# ceval-hard - naive_average gen 35.59
from mmengine.config import read_base
with read_base():
# LLM compression datasets
from opencompass.configs.datasets.llm_compression.llm_compression import llm_compression_datasets
# Model configs
from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import LLMCompressionSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
# -------------Inference Stage ----------------------------------------
datasets = [*llm_compression_datasets]
work_dir = 'outputs/llm_compression'
models = [
*qwen1_5_7b,
*qwen1_5_14b,
*llama2_7b,
*llama2_13b,
]
# Set a custom batch_size and num_gpus for faster loss calculation
# A smaller batch_size should give more precise results, at the cost of slower evaluation
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
for mdl in models:
mdl.update(model_cfg)
infer = dict(
    # The OpenCompass implementation of BPC currently only supports NaivePartitioner,
    # as the sliding-window approach requires the dataset to be loaded sequentially.
    # Using other partitioner types may produce incorrect results.
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # Maximum number of concurrent inference tasks
),
)
# -------------Evaluation Stage ----------------------------------------
eval = dict(partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLEvalTask),
max_num_workers=256,
))
# -------------Summarization Stage ----------------------------------------
summarizer = dict(type=LLMCompressionSummarizer)
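# Illustrative sketch: a bits-per-character (BPC) style compression metric is
# conventionally the summed negative log-likelihood of the text, normalized by
# its character count and converted from nats to bits. The helper below shows
# only that formula; it is NOT OpenCompass's LLMCompressionSummarizer, and the
# function name and arguments are illustrative assumptions.
import math


def _toy_bits_per_character(total_nll_nats: float, num_chars: int) -> float:
    """Convert a summed NLL (in nats) over a text into bits per character."""
    if num_chars <= 0:
        raise ValueError('num_chars must be positive')
    return total_nll_nats / (math.log(2) * num_chars)


# Example: 1.5e6 nats of total loss over 1,000,000 characters is roughly 2.16 BPC.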
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
# Import pre-configured models from OpenCompass
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct_model,
)
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
),
judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
# Dataset configuration
datasets = [
dict(
type=CustomDataset,
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model
# Limit the test set to the first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'
# Output directory
work_dir = 'outputs/llm_judge'
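# Illustrative sketch: the GRADER_TEMPLATE above asks the judge to reply with a
# bare "A" (correct) or "B" (incorrect). Conceptually, a postprocessor in the
# spirit of generic_llmjudge_postprocess only has to map those verdicts to a
# score; the helper below shows that idea and is NOT the actual OpenCompass
# postprocessor, which handles more output formats and bookkeeping.
import re


def _toy_judge_accuracy(judge_outputs):
    """Map raw A/B judge replies to an accuracy percentage (illustrative)."""
    verdicts = []
    for text in judge_outputs:
        match = re.search(r'\b([AB])\b', text.strip())
        verdicts.append(bool(match) and match.group(1) == 'A')
    return 100.0 * sum(verdicts) / max(len(verdicts), 1)


# _toy_judge_accuracy(['A', 'B', 'A']) -> 66.66...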
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
gsm8k_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_1_8b_chat import \
models
datasets = gsm8k_datasets
models = models
from mmengine.config import read_base
with read_base():
# Models
# Datasets
from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
LongBenchv2_datasets as LongBenchv2_datasets
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
models as lmdeploy_glm4_9b_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
models as lmdeploy_qwen2_5_7b_instruct_model
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
model['max_seq_len'] = 128 * 1024
model['engine_config']['session_len'] = 128 * 1024
model['engine_config']['tp'] = 2
model['run_cfg']['num_gpus'] = 2
    # Drop middle tokens so the input stays shorter than session_len; 128k is kept to stay in sync with the original LongBench v2 code
    # drop_middle currently only supports LMDeploy models
model['drop_middle'] = True
work_dir = './outputs/longbenchv2'
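# Illustrative sketch: drop_middle keeps the head and tail of an over-long
# prompt and removes tokens from the middle so the input fits within
# session_len. The function below is a minimal sketch of that idea on a
# token-id list; it is NOT the LMDeploy/OpenCompass implementation.
def _toy_drop_middle(token_ids, max_len):
    """Keep the first and last halves of the budget, dropping the middle."""
    if len(token_ids) <= max_len:
        return token_ids
    head = max_len // 2
    tail = max_len - head
    return token_ids[:head] + token_ids[-tail:]


# _toy_drop_middle(list(range(10)), 4) -> [0, 1, 8, 9]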
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base
with read_base():
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
from opencompass.configs.datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
from opencompass.datasets import math_judement_preprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AllObjSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Prompt Settings ----------------------------------------
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
Examples:
Expression 1: $2x+3$
Expression 2: $3+2x$
[Yes]
Expression 1: 3/2
Expression 2: 1.5
[Yes]
Expression 1: $x^2+2x+1$
Expression 2: $y^2+2y+1$
[No]
Expression 1: $x^2+2x+1$
Expression 2: $(x+1)^2$
[Yes]
Expression 1: 3245/5
Expression 2: 649
[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
Expression 1: 2/(-3)
Expression 2: -2/3
[Yes]
(trivial simplifications are allowed)
Expression 1: 72 degrees
Expression 2: 72
[Yes]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2: 64 square feet
[Yes]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2:
[No]
(only mark as equivalent if both expressions are nonempty)
---
YOUR TASK
Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
Expression 1: {obj_gold}
Expression 2: {prediction}
"""
# -------------Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model
eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'
for d in eng_datasets:
d['eval_cfg'] = dict(
evaluator=dict(
type=LMEvaluator,
# If you need to preprocess the prediction before judging,
# you can specify the pred_postprocessor function here
pred_postprocessor=dict(type=math_judement_preprocess),
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=eng_obj_prompt),
]),
),
),
pred_role='BOT',
)
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
# ------------- Evaluation Configuration --------------------------------
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
max_task_size=80000,
mode='singlescore',
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=AllObjSummarizer)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import \
math_datasets
    # Select a model of interest
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
models as qwen2_5_72b_instruct_model
eval_model_name = 'eval_model_name'
postprocessor_model_name = 'postprocessor_model_name'
eval_model_urls = ['http://0.0.0.0:23333/v1']
postprocessor_model_urls = ['http://0.0.0.0:23333/v1']
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for dataset in datasets:
dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name
dataset['eval_cfg']['evaluator']['url'] = eval_model_urls
dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls
dataset['eval_cfg']['evaluator'][
'post_model_name'] = postprocessor_model_name
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLEvalTask)),
)
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from opencompass.configs.datasets.math.math_500_gen import math_datasets
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-llama-8b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
gen_config=dict(
top_k=1,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
]
datasets = [*math_datasets]
work_dir = './outputs/math_500'
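# Illustrative sketch: the DeepSeek-R1 distilled models emit their chain of
# thought inside <think>...</think> tags, and extract_non_reasoning_content is
# attached as pred_postprocessor so that only the final answer is scored. The
# regex below is a simplified sketch of that behaviour, NOT the actual
# OpenCompass implementation.
import re


def _toy_strip_reasoning(prediction: str) -> str:
    """Drop a <think>...</think> block and return the remaining answer text."""
    return re.sub(r'<think>.*?</think>', '', prediction, flags=re.DOTALL).strip()


# _toy_strip_reasoning('<think>long reasoning...</think>The answer is 42.')
# -> 'The answer is 42.'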
from mmengine.config import read_base
with read_base():
# Import models
# Import datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import \
mathbench_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
models as internlm2_chat_7b_model
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as llama3_8b_instruct_model
# Import summarizers for display results
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
summarizer # Grouped results for MathBench-A and MathBench-T separately
# from opencompass.configs.summarizers.mathbench_v1 import summarizer # Detailed results for every sub-dataset
# from opencompass.configs.summarizers.groups.mathbench_v1_2024_lang import summarizer # Grouped results for bilingual results
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
eval = dict(
partitioner=dict(type=NaivePartitioner, n=8),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLEvalTask)),
)
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
work_dir = './outputs/mathbench_results'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import \
mmlu_cf_datasets
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
models as hf_qwen2_5_7b_instruct_model
from opencompass.configs.summarizers.mmlu_cf import summarizer
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLEvalTask)),
)
work_dir = 'outputs/debug/mmlu_cf'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
mmlu_pro_datasets
from opencompass.configs.internal.clusters.local import eval
from opencompass.configs.internal.clusters.local import \
infer_num_worker as infer
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model
from opencompass.configs.summarizers.mmlu_pro import summarizer
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = 'outputs/debug/mmlu_pro'
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind
# ------------------------- --------- ------------- ------ ----------------------------- -------------------------------
# mmlu_pro - naive_average gen 46.18 43.92
# mmlu_pro_biology 736233 accuracy gen 63.74 64.02
# mmlu_pro_business 736233 accuracy gen 53.23 46.01
# mmlu_pro_chemistry 736233 accuracy gen 35.25 32.42
# mmlu_pro_computer_science 736233 accuracy gen 47.07 44.88
# mmlu_pro_economics 736233 accuracy gen 59.00 53.79
# mmlu_pro_engineering 736233 accuracy gen 26.73 33.54
# mmlu_pro_health 736233 accuracy gen 47.31 51.34
# mmlu_pro_history 736233 accuracy gen 42.78 42.26
# mmlu_pro_law 736233 accuracy gen 28.07 26.98
# mmlu_pro_math 736233 accuracy gen 53.59 37.53
# mmlu_pro_philosophy 736233 accuracy gen 42.28 42.48
# mmlu_pro_physics 736233 accuracy gen 39.11 33.64
# mmlu_pro_psychology 736233 accuracy gen 60.90 59.65
# mmlu_pro_other 736233 accuracy gen 47.40 46.32
from copy import deepcopy
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
with read_base():
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
        mmlu_datasets  # this dataset config uses 5-shot evaluation
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
datasets = []
for d in mmlu_datasets:
d = deepcopy(d)
d['infer_cfg']['retriever'] = dict(type=ZeroRetriever)
datasets.append(d)
# export DATASET_SOURCE='ModelScope'  # set this environment variable before running this script
from datasets import Dataset, DatasetDict
from mmengine.config import read_base
from tqdm import tqdm
with read_base():
from opencompass.configs.datasets.agieval.agieval_gen import \
agieval_datasets as agieval_v2_datasets # ok
from opencompass.configs.datasets.agieval.agieval_gen_a0c741 import \
agieval_datasets as agieval_v1_datasets # ok
from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
ARC_c_datasets as ARC_c_clean_datasets # ok
from opencompass.configs.datasets.ARC_c.ARC_c_gen import \
ARC_c_datasets # ok
from opencompass.configs.datasets.ARC_e.ARC_e_gen import \
ARC_e_datasets # ok
from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
ceval_datasets as ceval_clean_datasets # ok
from opencompass.configs.datasets.ceval.ceval_gen import \
ceval_datasets # ok
from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen import \
afqmc_datasets # ok
from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import \
cmnli_datasets # ok
from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import \
cmnli_datasets as cmnli_ppl_datasets # ok
from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import \
ocnli_datasets # ok
from opencompass.configs.datasets.cmmlu.cmmlu_gen import \
cmmlu_datasets # ok
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen import \
        commonsenseqa_datasets  # needs extra handling for GPT
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen import \
GaokaoBench_datasets # ok
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_mixed import \
GaokaoBench_datasets as GaokaoBench_mixed_datasets # ok
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets as GaokaoBench_no_subjective_datasets # ok
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # ok
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets as hellaswag_ice_datasets # ok
from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
hellaswag_datasets as hellaswag_clean_datasets # ok
from opencompass.configs.datasets.hellaswag.hellaswag_gen import \
hellaswag_datasets as hellaswag_v2_datasets # ok
from opencompass.configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import \
hellaswag_datasets as hellaswag_v1_datasets # ok
from opencompass.configs.datasets.hellaswag.hellaswag_ppl_a6e128 import \
hellaswag_datasets as hellaswag_v3_datasets # ok
from opencompass.configs.datasets.humaneval.humaneval_gen import \
humaneval_datasets # ok
from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
humaneval_datasets as humaneval_repeat10_datasets # ok
from opencompass.configs.datasets.lambada.lambada_gen import \
lambada_datasets # ok
from opencompass.configs.datasets.lcsts.lcsts_gen import \
lcsts_datasets # ok
from opencompass.configs.datasets.math.math_gen import math_datasets # ok
from opencompass.configs.datasets.mbpp.mbpp_gen import \
mbpp_datasets as mbpp_v1_datasets # ok
from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import \
mbpp_datasets as mbpp_v2_datasets # ok
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \
sanitized_mbpp_datasets # ok
from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import \
mmlu_datasets as mmlu_clean_datasets # ok
from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets # ok
from opencompass.configs.datasets.nq.nq_gen import nq_datasets # ok
from opencompass.configs.datasets.obqa.obqa_gen import obqa_datasets # ok
from opencompass.configs.datasets.obqa.obqa_ppl_6aac9e import \
obqa_datasets as obqa_ppl_datasets # ok
from opencompass.configs.datasets.piqa.piqa_gen import \
piqa_datasets as piqa_v2_datasets # ok
from opencompass.configs.datasets.piqa.piqa_ppl import \
piqa_datasets as piqa_v1_datasets # ok
from opencompass.configs.datasets.piqa.piqa_ppl_0cfff2 import \
piqa_datasets as piqa_v3_datasets # ok
from opencompass.configs.datasets.race.race_ppl import race_datasets # ok
from opencompass.configs.datasets.siqa.siqa_gen import \
siqa_datasets as siqa_v2_datasets # ok
from opencompass.configs.datasets.siqa.siqa_gen_18632c import \
siqa_datasets as siqa_v3_datasets # ok
from opencompass.configs.datasets.siqa.siqa_ppl_42bc6e import \
siqa_datasets as siqa_ppl_datasets # ok
from opencompass.configs.datasets.storycloze.storycloze_gen import \
storycloze_datasets # ok
from opencompass.configs.datasets.storycloze.storycloze_ppl import \
storycloze_datasets as storycloze_ppl_datasets # ok
from opencompass.configs.datasets.strategyqa.strategyqa_gen import \
strategyqa_datasets
from opencompass.configs.datasets.summedits.summedits_gen import \
summedits_datasets as summedits_v2_datasets # ok
from opencompass.configs.datasets.triviaqa.triviaqa_gen import \
triviaqa_datasets # ok
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets as triviaqa_wiki_1shot_datasets # ok
from opencompass.configs.datasets.tydiqa.tydiqa_gen import \
tydiqa_datasets # ok
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets as winogrande_5shot_ll_datasets # ok
from opencompass.configs.datasets.winogrande.winogrande_gen import \
winogrande_datasets
from opencompass.configs.datasets.winogrande.winogrande_ll import \
winogrande_datasets as winogrande_ll_datasets # ok
from opencompass.configs.datasets.Xsum.Xsum_gen import Xsum_datasets
from opencompass.configs.models.opt.hf_opt_125m import models
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
for d in datasets:
d['reader_cfg'].update({'train_range': '[0:5]', 'test_range': '[0:5]'})
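# Illustrative sketch: 'train_range'/'test_range' strings such as '[0:5]' act
# as Python-style slices over the corresponding split, which is how the loop
# above turns every dataset into a 5-example smoke test. The helper below
# mimics that behaviour on a plain list; it is NOT OpenCompass's dataset
# reader.
def _toy_apply_range(samples, range_str):
    """Apply a '[start:stop]' style range string to a list of samples."""
    start, stop = (int(x) if x else None
                   for x in range_str.strip('[]').split(':'))
    return samples[start:stop]


# _toy_apply_range(list(range(100)), '[0:5]') -> [0, 1, 2, 3, 4]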
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
with read_base():
from opencompass.configs.datasets.winogrande.winogrande_gen_a027b6 import \
winogrande_datasets
datasets = [*winogrande_datasets]
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm-chat-7b-hf',
path='internlm/internlm-chat-7b',
tokenizer_path='internlm/internlm-chat-7b',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
_winogrande_all = [d['abbr'] for d in winogrande_datasets]
summarizer = dict(summary_groups=[
{
'name': 'winogrande',
'subsets': _winogrande_all
},
{
'name': 'winogrande_std',
'subsets': _winogrande_all,
'std': True
},
])
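# Illustrative sketch: the second summary group above additionally reports the
# spread across the winogrande sub-splits. Conceptually the aggregation
# reduces to a mean and a standard deviation over the subset scores; the
# helper below shows only that reduction and is NOT the OpenCompass
# summarizer, which reads scores from result files (and may use a different
# std convention).
import statistics


def _toy_group_summary(subset_scores):
    """Return mean and population std of a {subset: score} mapping."""
    values = list(subset_scores.values())
    return {'mean': statistics.mean(values), 'std': statistics.pstdev(values)}


# _toy_group_summary({'winogrande_a': 66.0, 'winogrande_b': 68.0})
# -> {'mean': 67.0, 'std': 1.0}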
import os.path as osp
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
models as lmdeploy_glm4_9b_chat_model
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
models as lmdeploy_gemma_9b_it_model
from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
models as lmdeploy_gemma_27b_it_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
models as lmdeploy_ministral_8b_instruct_2410_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
models as lmdeploy_qwen2_5_7b_instruct_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
models as lmdeploy_qwen2_5_14b_instruct_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
models as lmdeploy_qwen2_5_32b_instruct_model
from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
models as lmdeploy_yi_1_5_9b_chat_model
from opencompass.configs.summarizers.groups.musr_average import summarizer
datasets = [*musr_datasets]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
base_exp_dir = 'outputs/musr/'
work_dir = osp.join(base_exp_dir, 'musr_eval')
from mmengine.config import read_base
# we use mmengine.config to import other config files
with read_base():
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b
    # Evaluate needlebench_32k; adjust the configuration to the 4k, 128k, 200k, or 1000k variants if necessary.
# from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_32k import needlebench_datasets
# from opencompass.configs.summarizers.needlebench import needlebench_32k_summarizer as summarizer
    # Only evaluate the original "needle in a haystack" test in needlebench_32k
from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_single_32k import needlebench_zh_datasets, needlebench_en_datasets
from opencompass.configs.summarizers.needlebench import needlebench_v2_32k_summarizer as summarizer
    # Evaluate the Ancestral Tracing Challenge (ATC)
# from opencompass.configs.datasets.needlebench_v2.atc.atc_0shot_nocot_2_power_en import needlebench_datasets
    # ATC uses the default summarizer, so there is no need to import one
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
for m in internlm2_chat_7b:
m['max_seq_len'] = 32768 # Ensure InternLM2-7B model can receive the full long text; for other models, adjust according to their supported maximum sequence length.
m['max_out_len'] = 4096
models = internlm2_chat_7b
work_dir = './outputs/needlebench'
import os.path as osp
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
with read_base():
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
#######################################################################
# PART 0 Meta Info #
#######################################################################
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
)
judge_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct',
key='sk-1234',
openai_api_base=[
'http://x.x.x.x:4000/v1',
],
meta_template=api_meta_template,
query_per_second=8,
batch_size=256,
temperature=0.001,
# max_completion_tokens=32768,
tokenizer_path='gpt-4o-2024-05-13',
# verbose=True,
max_out_len=16384,
max_seq_len=32768,
# max_seq_len=49152,
mode='mid',
retry=10
)
#######################################################################
# PART 1 Datasets List #
#######################################################################
repeated_info = [
(math_datasets, 4),
(aime2024_datasets, 32),
(aime2025_datasets, 32),
]
for datasets_, num in repeated_info:
for dataset_ in datasets_:
dataset_['n'] = num
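# Illustrative sketch: setting dataset_['n'] repeats each sample n times at
# inference, and the summarizer then reports metrics such as
# 'accuracy (32 runs average)' (see PART 2 below). The helper here shows only
# that averaging step; it is NOT OpenCompass's aggregation code.
def _toy_n_run_average(per_run_accuracies):
    """Average per-run accuracies into a single reported score."""
    return sum(per_run_accuracies) / len(per_run_accuracies)


# _toy_n_run_average([40.0, 43.3, 36.7, 40.0]) -> 40.0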
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
for item in datasets:
item['infer_cfg']['inferencer']['max_out_len'] = 32768
try:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
        elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
    except KeyError:
pass
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
summarizer = dict(
dataset_abbrs=[
'MATH',
['math_prm800k_500', 'accuracy (4 runs average)'],
['aime2024', 'accuracy (32 runs average)'],
['aime2025', 'accuracy (32 runs average)'],
['livemathbench_hard', 'naive_average'],
['OlympiadBenchMath', 'accuracy'],
['olymmath', 'naive_average'],
],
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
models += [
dict(
abbr='Qwen_Qwen3-235B-A22B',
type=OpenAISDK,
path='Qwen/Qwen3-235B-A22B',
key='sk-admin',
openai_api_base=[
'http://106.15.231.215:40007/v1/',
],
meta_template=dict(
# begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
round=[
dict(role='HUMAN', api_role='HUMAN'),
                # XXX: all system roles are intentionally mapped to HUMAN
dict(role='BOT', api_role='BOT', generate=True),
]
),
query_per_second=16,
batch_size=128,
# batch_size=1,
temperature=0.6,
# max_completion_tokens=32768,
tokenizer_path='gpt-4',
# verbose=True,
max_out_len=32768,
max_seq_len=32768,
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
]
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=NaivePartitioner, n=8),
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)
base_exp_dir = 'outputs/qwen3_reasoning'
work_dir = osp.join(base_exp_dir, 'chat_objective')
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.collections.leaderboard.qwen import \
datasets
from opencompass.configs.models.qwen.hf_qwen_7b import models
from opencompass.configs.summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-hf
-------------------------------------- --------- ---------------- ------ ------------
--------- 考试 Exam --------- - - - -
ceval - naive_average ppl 58.65
agieval - naive_average mixed 40.49
mmlu - naive_average ppl 57.78
cmmlu - naive_average ppl 58.57
GaokaoBench - weighted_average mixed 51.76
ARC-c 72cf91 accuracy gen 83.73
ARC-e 72cf91 accuracy gen 90.65
--------- 语言 Language --------- - - - -
WiC ce62e6 accuracy ppl 51.10
chid-dev 25f3d3 accuracy ppl 86.63
afqmc-dev cc328c accuracy ppl 69.00
WSC 678cb5 accuracy ppl 63.46
tydiqa-goldp - naive_average gen 19.98
flores_100 - naive_average gen 3.20
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.00
commonsense_qa 0d8e25 accuracy ppl 67.49
triviaqa b6904f score gen 40.45
nq b6904f score gen 14.16
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 75.29
race-middle 73bdec accuracy ppl 90.53
race-high 73bdec accuracy ppl 87.71
openbookqa_fact fa871c accuracy gen 92.20
csl_dev 3c4211 accuracy ppl 56.25
lcsts 0b3969 rouge1 gen 12.38
Xsum 207e69 rouge1 gen 36.00
eprstmt-dev 101429 accuracy gen 89.38
lambada de1af2 accuracy gen 67.88
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 54.85
ocnli 1471e7 accuracy gen 42.34
AX_b 793c72 accuracy gen 58.61
AX_g c4c886 accuracy gen 69.10
RTE c4c886 accuracy gen 57.76
COPA 59f42c accuracy gen 88.00
ReCoRD 3e0689 score gen 27.78
hellaswag 06a1e2 accuracy gen 92.47
piqa 24369d accuracy gen 78.02
siqa ea30d1 accuracy ppl 75.03
math 2c0b9e accuracy gen 11.06
gsm8k 4c7f6e accuracy gen 50.87
drop 53a0a7 score gen 44.95
openai_humaneval dd0dff humaneval_pass@1 gen 23.78
mbpp 60ca11 score gen 31.20
bbh - naive_average gen 40.03
'''
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.collections.leaderboard.qwen_chat import \
datasets
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
from opencompass.configs.summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-chat-hf
-------------------------------------- --------- ---------------- ------ -----------------
--------- 考试 Exam --------- - - - -
ceval - naive_average gen 56.07
agieval - naive_average mixed 39.51
mmlu - naive_average gen 53.49
cmmlu - naive_average gen 55.29
GaokaoBench - weighted_average gen 48.01
ARC-c ca1e8e accuracy ppl 74.92
ARC-e ca1e8e accuracy ppl 85.71
--------- 语言 Language --------- - - - -
WiC efbd01 accuracy gen 51.41
chid-dev 25f3d3 accuracy ppl 77.72
afqmc-dev 4a1636 accuracy gen 69.00
WSC 678cb5 accuracy ppl 67.31
tydiqa-goldp - naive_average gen 15.32
flores_100 - naive_average gen 10.00
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.18
commonsense_qa ddaabf accuracy gen 76.41
triviaqa b6904f score gen 43.25
nq 23dc1a score gen 16.26
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 81.53
race-middle e0908b accuracy gen 83.01
race-high e0908b accuracy gen 77.79
openbookqa_fact 49689a accuracy ppl 86.40
csl_dev 3c4211 accuracy ppl 64.38
lcsts 0b3969 rouge1 gen 12.75
Xsum 207e69 rouge1 gen 20.21
eprstmt-dev ed0c5d accuracy ppl 85.00
lambada de1af2 accuracy gen 59.19
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 48.08
ocnli 15e783 accuracy ppl 51.40
AX_b 689df1 accuracy ppl 65.67
AX_g 808a19 accuracy ppl 76.12
RTE 808a19 accuracy ppl 68.95
COPA 59f42c accuracy gen 92.00
ReCoRD 6f7cfc score gen 0.16
hellaswag 8d79e0 accuracy ppl 69.28
piqa 34eee7 accuracy ppl 72.20
siqa ea30d1 accuracy ppl 72.88
math 2c0b9e accuracy gen 7.84
gsm8k 4c7f6e accuracy gen 45.41
drop 53a0a7 score gen 39.62
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
mbpp 60ca11 score gen 20.60
bbh - naive_average gen 42.61
'''