Commit c289ecc0 authored by xinghao
Initial commit
from mmengine.config import read_base
with read_base():
    # Inference PPL datasets
    from opencompass.configs.datasets.inference_ppl.inference_ppl import \
        inference_ppl_datasets
    # Model configs
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
    from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
    from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
    from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
# -------------Inference Stage ----------------------------------------
datasets = [*inference_ppl_datasets]
work_dir = 'outputs/inference_ppl'
models = [
    *qwen1_5_7b,
    *qwen1_5_14b,
    *llama2_7b,
    *llama2_13b,
]
# Set a custom batch_size and num_gpus for faster loss calculation.
# A smaller batch_size gives more precise results, at the cost of lower efficiency.
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
for mdl in models:
    mdl.update(model_cfg)
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # maximum number of concurrent inference tasks
    ),
)
# -------------Evaluation Stage ----------------------------------------
eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
        max_num_workers=256,
    ),
)
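# A minimal way to launch this config (assumption: the file is saved as
# eval_inference_ppl.py) is via the standard OpenCompass entry point:
#   opencompass eval_inference_ppl.py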
from mmengine.config import read_base
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.base_medium import datasets
    # choose a model of interest
    from opencompass.configs.models.internlm.internlm_7b import models
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
from copy import deepcopy
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import \
        agieval_datasets
    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.math.math_evaluatorv2_gen_cecb31 import \
        math_datasets
    from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import \
        sanitized_mbpp_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as hf_internlm2_chat_7b_model
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
        models as hf_internlm2_chat_20b_model
    from opencompass.configs.summarizers.internlm2_keyset import summarizer
work_dir = './outputs/internlm2-chat-keyset/'
_origin_datasets = sum(
    [v for k, v in locals().items() if k.endswith('_datasets')], [])
_origin_models = sum(
    [v for k, v in locals().items() if k.endswith('_model')], [])
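# The sum(...) calls above flatten every imported `*_datasets` / `*_model`
# variable in the module namespace into single lists, so any new import
# added inside read_base() is swept up automatically.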
_vanilla_datasets = [deepcopy(d) for d in _origin_datasets]
_vanilla_models = []
for m in _origin_models:
    m = deepcopy(m)
    if 'meta_template' in m and 'round' in m['meta_template']:
        round = m['meta_template']['round']
        if any(r['role'] == 'SYSTEM' for r in round):
            new_round = [r for r in round if r['role'] != 'SYSTEM']
            print(
                f'WARNING: removing the SYSTEM round from the meta_template of {m.get("abbr", None)}'
            )
            m['meta_template']['round'] = new_round
    _vanilla_models.append(m)
datasets = _vanilla_datasets
models = _vanilla_models
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.agieval.agieval_mixed_713d14 import \
        agieval_datasets
    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
        humaneval_datasets
    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
    from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import \
        sanitized_mbpp_datasets
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b_model
    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
        models as hf_internlm2_20b_model
    from opencompass.configs.summarizers.internlm2_keyset import summarizer
work_dir = './outputs/internlm2-keyset/'
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# To run this example, follow these steps:
# 1. Install the latest opencompass
# 2. Start a local server with Qwen2.5-72B-Instruct as the LLM judge (e.g. using vLLM or LMDeploy)
# 3. Change the judge_cfg openai_api_base to your corresponding local server address
# 4. Start the evaluation by running 'opencompass eval_internlm3_math500_thinking.py'
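# For step 2, the judge server could, for example, be started with LMDeploy
# (assumption: lmdeploy is installed and sufficient GPU memory is available):
#   lmdeploy serve api_server Qwen/Qwen2.5-72B-Instruct --server-port 23333 --tp 4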
from opencompass.models import VLLMwithChatTemplate, OpenAISDK
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000 import (
        math_datasets,
    )
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)
judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-72B-Instruct',
    key='YOUR_API_KEY',
    openai_api_base=[
        'http://172.30.56.81:23333/v1/',  # change to your own server
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_seq_len=32768,
    max_completion_tokens=32768,
)
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# set max_out_len for inference
for item in datasets:
    item['infer_cfg']['inferencer']['max_out_len'] = 16384
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
reasoning_chat_template = """You are an expert mathematician with extensive experience in mathematical competitions. You approach problems through systematic thinking and rigorous reasoning. When solving problems, follow these thought processes:
## Deep Understanding
Take time to fully comprehend the problem before attempting a solution. Consider:
- What is the real question being asked?
- What are the given conditions and what do they tell us?
- Are there any special restrictions or assumptions?
- Which information is crucial and which is supplementary?
## Multi-angle Analysis
Before solving, conduct thorough analysis:
- What mathematical concepts and properties are involved?
- Can you recall similar classic problems or solution methods?
- Would diagrams or tables help visualize the problem?
- Are there special cases that need separate consideration?
## Systematic Thinking
Plan your solution path:
- Propose multiple possible approaches
- Analyze the feasibility and merits of each method
- Choose the most appropriate method and explain why
- Break complex problems into smaller, manageable steps
## Rigorous Proof
During the solution process:
- Provide solid justification for each step
- Include detailed proofs for key conclusions
- Pay attention to logical connections
- Be vigilant about potential oversights
## Repeated Verification
After completing your solution:
- Verify your results satisfy all conditions
- Check for overlooked special cases
- Consider if the solution can be optimized or simplified
- Review your reasoning process
Remember:
1. Take time to think thoroughly rather than rushing to an answer
2. Rigorously prove each key conclusion
3. Keep an open mind and try different approaches
4. Summarize valuable problem-solving methods
5. Maintain healthy skepticism and verify multiple times
Your response should reflect deep mathematical understanding and precise logical thinking, making your solution path and reasoning clear to others.
When you're ready, present your complete solution with:
- Clear problem understanding
- Detailed solution process
- Key insights
- Thorough verification
Focus on clear, logical progression of ideas and thorough explanation of your mathematical reasoning. Provide answers in the same language as the user asking the question, repeat the final answer using a '\\boxed{}' without any units, you have [[8192]] tokens to complete the answer.
"""
reasoning_meta_template = dict(
    begin=dict(
        role='SYSTEM', api_role='SYSTEM', prompt=reasoning_chat_template
    ),
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        # XXX: all system roles are mapped to human on purpose
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)
models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='internlm3-8b-instruct-vllm',
        path='internlm/internlm3-8b-instruct',
        model_kwargs=dict(tensor_parallel_size=1),
        generation_kwargs=dict(do_sample=False),  # greedy decoding
        max_seq_len=32768,
        max_out_len=16384,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
        meta_template=reasoning_meta_template,
    )
]
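# math_datasets is the only *_datasets import above, so this re-assignment
# keeps the same dataset objects that were patched in the loop earlier.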
datasets = math_datasets
from mmengine.config import read_base
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.base_medium import datasets
    # choose a model of interest
    from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
from mmengine.config import read_base
from opencompass.models.turbomind_api import TurboMindAPIModel
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
        crowspairs_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
        WSC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
                     eos_token_id=103028)
internlm_chat_20b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-20b-turbomind',
    api_addr='http://0.0.0.0:23333',
    api_key='internlm-chat-20b',  # api_key
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    meta_template=meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)
internlm_chat_7b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-7b-turbomind',
    api_addr='http://0.0.0.0:23333',
    api_key='internlm-chat-7b',  # api_key
    max_out_len=100,
    max_seq_len=2048,
    batch_size=16,
    meta_template=meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)
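# internlm_chat_7b is defined above but not evaluated by default; include it
# in the list below (e.g. models = [internlm_chat_20b, internlm_chat_7b]) to
# run both models.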
models = [internlm_chat_20b]
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
        crowspairs_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
        WSC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
internlm_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
                              eos_token_id=103028)
internlm2_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
    dict(role='BOT',
         begin='<|im_start|>assistant\n',
         end='<|im_end|>\n',
         generate=True),
],
                               eos_token_id=92542)
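# The two meta_templates above encode each model family's chat format:
# InternLM(1) wraps turns as '<|User|>:...\n' / '<|Bot|>:...<eoa>\n', while
# InternLM2 uses ChatML-style '<|im_start|>{role}\n...<|im_end|>\n' markers;
# eos_token_id must match the tokenizer's id for the corresponding stop token.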
# config for internlm-chat-7b
internlm_chat_7b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-7b-turbomind',
    path='internlm/internlm-chat-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    meta_template=internlm_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)
# config for internlm2-chat-7b
internlm2_chat_7b = dict(
    type=TurboMindModel,
    abbr='internlm2-chat-7b-turbomind',
    path='internlm/internlm2-chat-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1,
                    top_p=0.8,
                    temperature=1.0,
                    max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    meta_template=internlm2_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<|im_end|>')
# config for internlm-chat-20b
internlm_chat_20b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-20b-turbomind',
    path='internlm/internlm-chat-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    meta_template=internlm_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)
models = [internlm_chat_20b]
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import FlamesSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Inference Stage ----------------------------------------
with read_base():
    from opencompass.configs.datasets.flames.flames_gen import flames_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models
datasets = [*flames_datasets]
_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
    dict(role='BOT',
         begin='<|im_start|>assistant\n',
         end='<|im_end|>\n',
         generate=True),
], )
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        tokenizer_path='internlm/internlm2-chat-7b',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
        generation_kwargs={
            'eos_token_id': [2, 92542],
            'do_sample': True,
        },
        batch_padding=True,
    )
]
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration---------------------------------
internlm1_chat_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
judge_models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='flames-scorer',
        path='CaasiHUANG/flames-scorer',
        tokenizer_path='CaasiHUANG/flames-scorer',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        generation_kwargs={'do_sample': True},
        max_out_len=512,
        max_seq_len=4096,
        batch_size=8,
        meta_template=internlm1_chat_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<eoa>',
    )
]
## ------------- Evaluation Configuration----------------
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=SubjectiveEvalTask)),
)
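# In 'singlescore' mode, each model's predictions are routed to the single
# flames-scorer judge model, which assigns a score per response; the
# FlamesSummarizer below then aggregates those judge scores into the report.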
summarizer = dict(type=FlamesSummarizer, judge_type='general')
work_dir = 'outputs/flames/'
from mmengine.config import read_base
from opencompass.models.turbomind_api import TurboMindAPIModel
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
internlm_chat_20b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-20b-turbomind',
    api_addr='http://0.0.0.0:23333',
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
internlm_chat_7b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-7b-turbomind',
    api_addr='http://0.0.0.0:23333',
    max_out_len=100,
    max_seq_len=2048,
    batch_size=16,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
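# Both entries assume a TurboMind (LMDeploy) API server is already serving
# the corresponding model at api_addr; add internlm_chat_7b to the list
# below to evaluate both models.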
models = [internlm_chat_20b]
from mmengine.config import read_base
from opencompass.models.huggingface import HuggingFaceCausalLM
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.math.math_gen_736506 import math_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_7b import \
        models as internlm_math_chat_7b_models
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_20b import \
        models as internlm_math_chat_20b_models
# Evaluate MATH and GSM8K for both InternLM2-Math-Chat 7B and 20B
datasets = [*math_datasets, *gsm8k_datasets]
models = [*internlm_math_chat_7b_models, *internlm_math_chat_20b_models]
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# config for internlm-7b model
internlm_7b = dict(
    type=TurboMindModel,
    abbr='internlm-7b-turbomind',
    path='internlm/internlm-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
# config for internlm-20b model
internlm_20b = dict(
    type=TurboMindModel,
    abbr='internlm-20b-turbomind',
    path='internlm/internlm-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
models = [internlm_20b]
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgerbenchv2 import \
        get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
    from opencompass.configs.datasets.judge.rmb import \
        get_rmb_dataset as get_rmb_datasets
    from opencompass.configs.datasets.judge.rewardbench import \
        get_rewardbench_datasets
    from opencompass.configs.datasets.judge.judgebench import \
        get_judgebench_datasets
    from opencompass.configs.summarizers.judgedataset_all import summarizer
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.models import TurboMindModelwithChatTemplate
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)
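# NumWorkerPartitioner shards the datasets into num_worker=8 partitions so
# the inference tasks can run in parallel under the LocalRunner.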
work_dir = './outputs/judge_dataset_all/'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgebench import \
        get_judgebench_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*get_judgebench_datasets]
from opencompass.models import TurboMindModelwithChatTemplate
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)
work_dir = './outputs/judgebench/'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import \
        judgerbench_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable do_sample for models
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-7B-Instruct',
        path='opencompass/CompassJudger-1-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
datasets = judgerbench_datasets
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=10,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)
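# Here n=10 asks the partitioner to bundle ten dataset/model pairs into each
# evaluation task (assumption: NaivePartitioner's `n` grouping parameter).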
work_dir = 'outputs/judgerbench/'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgerbenchv2 import \
        get_judgerbenchv2_dataset
    from opencompass.configs.summarizers.judgerbenchv2 import summarizer
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*get_judgerbenchv2_dataset]
from opencompass.models import TurboMindModelwithChatTemplate
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]
infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)
work_dir = './outputs/judgerbenchv2/'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import \
        korbench_mixed_datasets as mixed_datasets
    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
        korbench_0shot_single_datasets as zero_shot_datasets
    from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import \
        korbench_3shot_single_datasets as three_shot_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b
datasets = zero_shot_datasets + three_shot_datasets + mixed_datasets
models = hf_internlm2_5_7b
from mmengine.config import read_base
from opencompass.models import LightllmAPI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
    from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
        humaneval_datasets
    from opencompass.configs.summarizers.leaderboard import summarizer
datasets = [*humaneval_datasets]
'''
# Prompt template for InternLM2-Chat
# https://github.com/InternLM/InternLM/blob/main/chat/chat_format.md
_meta_template = dict(
    begin='<|im_start|>system\nYou are InternLM2-Chat, a harmless AI assistant<|im_end|>\n',
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ]
)
'''
_meta_template = None
models = [
    dict(
        abbr='LightllmAPI',
        type=LightllmAPI,
        url='http://localhost:1030/generate',
        meta_template=_meta_template,
        batch_size=32,
        max_workers_per_task=128,
        rate_per_worker=1024,
        retry=4,
        generation_kwargs=dict(do_sample=False,
                               ignore_eos=False,
                               max_new_tokens=1024),
    ),
]
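# The url above assumes a Lightllm API server is already listening on
# localhost:1030, started e.g. (assumption) with:
#   python -m lightllm.server.api_server --model_dir <model_path> --port 1030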
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLInferTask),
    ),
)
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.livestembench.livestembench_gen_3e3c50 import \
        livestembench_datasets
    # choose a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as qwen2_5_7b_instruct_lmdeploy_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_lmdeploy_model
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = [
    *qwen2_5_7b_instruct_lmdeploy_model, *qwen2_5_72b_instruct_lmdeploy_model
]
# Judge model configuration
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )
judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='YOUR_SERVER_MODEL_NAME',  # the name of your deployed model
    key='None',
    openai_api_base=[
        'http://localhost:23333/v1',  # the address where your model is deployed
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_completion_tokens=32768,
)

for dataset in datasets:
    dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)
work_dir = './outputs/livestembench'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import (
        piqa_datasets, siqa_datasets)
    from opencompass.configs.models.llama.llama2_7b import models
datasets = [*piqa_datasets, *siqa_datasets]