from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models.lagent import LagentAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_agent_gen_af2293 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import \
mathbench_agent_datasets
from opencompass.configs.summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets
system_prompt = """You are a helpful assistant who uses tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use the code tool to calculate. The example format is as follows:
```
def solution():
    variable_names_with_real_meaning = func(variable)
    return variable_names_with_real_meaning
```"""
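# For illustration (not part of the original config), a model reply that follows
# the required format above would look like:
#
#     def solution():
#         total_apples = 3 * 4 + 2  # three bags of four apples, plus two loose
#         return total_apples
#
# The PythonInterpreter action is expected to execute the function and hand its
# return value back to the ReAct loop.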
protocol = dict(
type=ReActProtocol,
action=dict(role='ACTION', begin='Tool:', end='\n'),
action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
finish=dict(role='FINISH', begin='FinalAnswer:', end='\n'),
call_protocol=system_prompt,
)
models = [
dict(
abbr='gpt-3.5-react',
type=LagentAgent,
agent_type=ReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=PythonInterpreter),
],
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
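# A config like this is typically launched with OpenCompass's runner script,
# e.g. (the filename here is an assumption):
#   python run.py configs/eval_gsm8k_math_agent.py -w outputs/math_agent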
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import \
mathbench_datasets
from opencompass.configs.summarizers.math_baseline import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_datasets
models = [
dict(
abbr='gpt-3.5-react',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import \
math_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import \
models as hf_internlm2_chat_1_8b_models
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
models as hf_qwen2_1_5b_instruct_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models
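# A minimal demo like this one can be run sequentially for easier debugging,
# e.g. (the filename here is an assumption):
#   python run.py configs/eval_chat_demo.py --debug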
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.openicl import ChatInferencer
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets as datasets
models = [
dict(
abbr='gpt-3.5',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
for dataset in datasets:
# Use ChatInferencer instead of GenInferencer
dataset['infer_cfg']['inferencer'] = dict(type=ChatInferencer)
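# Roughly speaking, ChatInferencer feeds the prompt to the model as structured
# chat turns (HUMAN/BOT rounds), matching how chat APIs such as gpt-3.5-turbo
# consume input, whereas GenInferencer flattens everything into one completion
# prompt.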
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ChemBench.ChemBench_gen import \
chembench_datasets
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models
datasets = [*chembench_datasets]
models = [*models]
'''
dataset version metric mode mistral-7b-instruct-v0.2-hf
-------------------------------- --------- -------- ------ -----------------------------
ChemBench_Name_Conversion d4e6a1 accuracy gen 45.43
ChemBench_Property_Prediction d4e6a1 accuracy gen 47.11
ChemBench_Mol2caption d4e6a1 accuracy gen 64.21
ChemBench_Caption2mol d4e6a1 accuracy gen 35.38
ChemBench_Product_Prediction d4e6a1 accuracy gen 38.67
ChemBench_Retrosynthesis d4e6a1 accuracy gen 27
ChemBench_Yield_Prediction d4e6a1 accuracy gen 27
ChemBench_Temperature_Prediction d4e6a1 accuracy gen 26.73
ChemBench_Solvent_Prediction d4e6a1 accuracy gen 32.67
'''
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Inference Stage ----------------------------------------
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='Qwen2.5-1.5B-Instruct',
path='Qwen/Qwen2.5-1.5B-Instruct',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(do_sample=True, ),
max_out_len=200,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
# Gather every imported *_datasets list into one flat list.
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
api_meta_template = dict(round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
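# The meta_template maps OpenCompass's internal roles (SYSTEM/HUMAN/BOT) onto
# the judge API's chat roles; `generate=True` marks the turn the judge model
# produces.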
judge_models = [
dict(
# GPT4o
abbr='gpt-4o-0513-global',
type=OpenAI,
# gpt-4o
path='gpt-4o-0513-global',
key='xxx',  # your OpenAI API key; 'ENV' reads OPENAI_API_KEY from the environment
meta_template=api_meta_template,
query_per_second=16,
max_out_len=1000,
batch_size=8,
retry=3)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
work_dir = 'outputs/chinese_simpleqa/'
from copy import deepcopy
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import HuggingFaceCausalLM
from opencompass.models.lagent import CodeAgent, LagentAgent
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# Note that CUDA OOM errors may occur for HF models
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.summarizers.cibench import summarizer
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please respond with the following format:
```
{thought} Think about what you need to solve. Do you need to use tools?
{action} The tool name; it should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you a response after your response, using the following format:
```
{response} The results after calling the tool.
```
Therefore, DO NOT generate tool responses by yourself.
Also, please follow these guidelines:
1. Always use the code interpreter to solve the problem.
2. The generated code should always be in a markdown code block.
3. The generated code will be executed in an IPython manner and the results will be cached.
4. Your code should always be simple and should only solve the problem in the current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in the same manner as a Jupyter notebook. The code must be valid Python code.'''
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = './outputs/cibench/'
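# The ReAct protocol needs a system-style round to carry tool feedback, so the
# loop below patches any chat meta_template that lacks a SYSTEM entry: plain
# templates get a literal 'System response:' prefix, API templates get
# api_role='SYSTEM'.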
_agent_models = []
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM'
for r in round): # no system round
if not any('api_role' in r for r in round):
m['meta_template']['round'].append(
dict(role='system', begin='System response:', end='\n'))
else:
m['meta_template']['round'].append(
dict(role='system', api_role='SYSTEM'))
print(
f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}'
)
_agent_models.append(m)
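# The loop below rebuilds each patched chat model as a CIReAct code agent. The
# plain-LM generation fields (batch_size, max_out_len, max_seq_len) are dropped
# because the agent drives generation itself, turn by turn, up to `max_turn`.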
models = []
for m in _agent_models:
m = deepcopy(m)
origin_abbr = m.pop('abbr')
abbr = origin_abbr
m.pop('batch_size', None)
m.pop('max_out_len', None)
m.pop('max_seq_len', None)
run_cfg = m.pop('run_cfg', {})
agent_model = dict(
abbr=abbr,
summarizer_abbr=origin_abbr,
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=m,
actions=[
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
],
protocol=protocol,
batch_size=1,
run_cfg=run_cfg,
)
models.append(agent_model)
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
from opencompass.configs.summarizers.cibench import summarizer
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please respond with the following format:
```
{thought} Think about what you need to solve. Do you need to use tools?
{action} The tool name; it should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you a response after your response, using the following format:
```
{response} The results after calling the tool.
```
Therefore, DO NOT generate tool responses by yourself.
Also, please follow these guidelines:
1. Always use the code interpreter to solve the problem.
2. The generated code should always be in a markdown code block.
3. The generated code will be executed in an IPython manner and the results will be cached.
4. Your code should always be simple and should only solve the problem in the current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in the same manner as a Jupyter notebook. The code must be valid Python code.'''
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
dict(role='SYSTEM', api_role='SYSTEM'),
], )
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = 'outputs/cibench/'
models = [
dict(
abbr='gpt-4o',
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-4o',
rpm_verbose=True,
retry=99,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=2048,
temperature=0,
),
actions=actions,
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.datasets.circular import (
CircularARCDataset, CircularCEvalDataset, CircularCMMLUDataset,
CircularCSQADataset, CircularEvaluator, CircularHSWAGDataset,
CircularMMLUDataset, CircularOBQADataset, CircularRaceDataset)
from opencompass.summarizers import CircularSummarizer
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import \
ARC_c_datasets
from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import \
ARC_e_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \
cmmlu_datasets
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import \
commonsenseqa_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \
models as hf_internlm_chat_7b_model
from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \
models as hf_internlm_chat_20b_model
from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
models as hf_qwen_7b_chat_model
from opencompass.configs.models.qwen.hf_qwen_14b_chat import \
models as hf_qwen_14b_chat_model
from opencompass.configs.summarizers.groups.ceval import \
ceval_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
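# CircularEval: each multiple-choice question is asked once per cyclic rotation
# of its answer options (4 rotations for 4-option questions). `acc_origin`
# scores only the original ordering, while `perf_circular` credits a question
# only if every rotation is answered correctly, penalizing option-position bias.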
for ds, t in [
(ceval_datasets, CircularCEvalDataset),
(mmlu_datasets, CircularMMLUDataset),
(cmmlu_datasets, CircularCMMLUDataset),
(hellaswag_datasets, CircularHSWAGDataset),
(ARC_e_datasets, CircularARCDataset),
(ARC_c_datasets, CircularARCDataset),
(commonsenseqa_datasets, CircularCSQADataset),
(obqa_datasets, CircularOBQADataset),
(race_datasets, CircularRaceDataset),
]:
for d in ds:
d['type'] = t
d['abbr'] = d['abbr'] + '-circular-4'
d['eval_cfg']['evaluator'] = {
'type': CircularEvaluator,
'circular_pattern': 'circular'
}
d['circular_patterns'] = 'circular'
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# config summarizer
other_summary_groups = [
{
'name':
'average',
'subsets': [
'ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c',
'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high'
]
},
]
origin_summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
new_summary_groups = []
for item in origin_summary_groups:
new_summary_groups.append({
'name':
item['name'] + '-circular-4',
'subsets': [i + '-circular-4' for i in item['subsets']],
})
summarizer = dict(
type=CircularSummarizer,
metric_types=['acc_origin', 'perf_circular'],
dataset_abbrs=[
'average-circular-4',
'ceval-circular-4',
'mmlu-circular-4',
'cmmlu-circular-4',
'hellaswag-circular-4',
'ARC-e-circular-4',
'ARC-c-circular-4',
'commonsense_qa-circular-4',
'openbookqa_fact-circular-4',
'race-middle-circular-4',
'race-high-circular-4',
'ceval-humanities-circular-4',
'ceval-stem-circular-4',
'ceval-social-science-circular-4',
'ceval-other-circular-4',
'mmlu-humanities-circular-4',
'mmlu-stem-circular-4',
'mmlu-social-science-circular-4',
'mmlu-other-circular-4',
'cmmlu-humanities-circular-4',
'cmmlu-stem-circular-4',
'cmmlu-social-science-circular-4',
'cmmlu-other-circular-4',
'cmmlu-china-specific-circular-4',
],
summary_groups=new_summary_groups,
)
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.models.claude.claude import models
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with `num_return_sequences`,
# i.e. for models that can generate multiple responses for a single input.
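# For reference, pass@k is normally computed with the unbiased estimator from
# the HumanEval paper; a minimal sketch (not used by this config directly):
#
#     import math
#
#     def pass_at_k(n: int, c: int, k: int) -> float:
#         """n samples drawn per problem, c of them correct."""
#         if n - c < k:
#             return 1.0  # every size-k draw contains a correct sample
#         return 1.0 - math.comb(n - c, k) / math.comb(n, k)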
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
models = [
dict(
type=HuggingFaceCausalLM,
abbr='CodeLlama-7b-Python',
path='codellama/CodeLlama-7b-Python-hf',
tokenizer_path='codellama/CodeLlama-7b-Python-hf',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
num_return_sequences=10,
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=300),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with dataset repetition,
# i.e. for models that cannot generate multiple responses for a single input.
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
models = [
dict(
abbr='internlm-chat-7b-hf-v11',
type=HuggingFaceCausalLM,
path='internlm/internlm-chat-7b-v1_1',
tokenizer_path='internlm/internlm-chat-7b-v1_1',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_seq_len=2048,
meta_template=_meta_template,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
batch_size=8,
)
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=600),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_57b0b1 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_943d32 import math_datasets
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
models = [
dict(abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
batch_size=8),
dict(abbr='WizardCoder-Python-13B-V1.0-react',
type=CodeAgent,
llm=dict(
type=HuggingFaceCausalLM,
path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_seq_len=2048,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
),
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1)),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.humanevalx.humanevalx_gen import \
humanevalx_datasets
from opencompass.configs.models.codegeex2.hf_codegeex2_6b import models
datasets = humanevalx_datasets
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_judge import compassarena_subjectivebench_singleturn_datasets
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_judge import compassarena_subjectivebench_multiturn_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import models as lmdeploy_internlm2_5_20b_chat
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import models as lmdeploy_llama3_1_8b_instruct
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import models as lmdeploy_llama3_1_70b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import models as lmdeploy_qwen2_5_0_5b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import models as lmdeploy_qwen2_5_1_5b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import models as lmdeploy_qwen2_5_3b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import models as lmdeploy_qwen2_5_14b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import models as lmdeploy_qwen2_5_32b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample=True) for models
# models = [
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='CompassJudger-1-7B-Instruct',
# path='opencompass/CompassJudger-1-7B-Instruct',
# engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
# gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
# max_seq_len=16384,
# max_out_len=2048,
# batch_size=16,
# run_cfg=dict(num_gpus=1),
# )
# ]
models = [
*lmdeploy_qwen2_5_14b_instruct, *lmdeploy_qwen2_5_32b_instruct,
*lmdeploy_qwen2_5_7b_instruct, *lmdeploy_qwen2_7b_instruct
]
datasets = [
*compassarena_subjectivebench_singleturn_datasets,
*compassarena_subjectivebench_multiturn_datasets
] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=DefaultSubjectiveSummarizer, )
work_dir = 'outputs/subjective/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_bt_judge import (
compassarena_subjectivebench_bradleyterry_singleturn_datasets, )
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_bt_judge import (
compassarena_subjectivebench_bradleyterry_multiturn_datasets, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat, )
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct, )
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import (
models as lmdeploy_llama3_1_70b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import (
models as lmdeploy_qwen2_5_0_5b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import (
models as lmdeploy_qwen2_5_1_5b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import (
models as lmdeploy_qwen2_5_3b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
models as lmdeploy_qwen2_5_32b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import (
models as lmdeploy_qwen2_5_72b_instruct, )
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct, )
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import CompassArenaBradleyTerrySummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
models = [
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_32b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*compassarena_subjectivebench_bradleyterry_singleturn_datasets,
*compassarena_subjectivebench_bradleyterry_multiturn_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry model (statistical model) with an option
# to include style features and control variables based on groups
# (group variables must be available in the input dataset for each observation).
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
report_pred_win_rates=True,
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
groups=['difficulty', 'category'],
)
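# Bradley-Terry in brief: each model m gets a latent strength beta_m with
#   P(i beats j) = exp(beta_i) / (exp(beta_i) + exp(beta_j)),
# which is a logistic regression on the indicator difference of the two models;
# the style features and group variables above enter as extra covariates.
# A toy sketch with hypothetical data (not part of this config):
#
#     import numpy as np
#     from sklearn.linear_model import LogisticRegression
#
#     # columns = models (A, B); each row is one battle, x = 1[first] - 1[second]
#     X = np.array([[1.0, -1.0], [1.0, -1.0], [-1.0, 1.0]])
#     y = np.array([1, 1, 0])  # 1 = the first model of the pair won
#     beta = LogisticRegression(fit_intercept=False).fit(X, y).coef_[0]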
work_dir = 'outputs/compassarena_subjectivebench_bradleyterry/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
ARC_c_datasets
from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
ceval_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model
from opencompass.configs.models.qwen.hf_qwen_7b import \
models as hf_qwen_7b_model
from opencompass.configs.models.yi.hf_yi_6b import models as hf_yi_6b_model
from opencompass.configs.summarizers.contamination import summarizer
datasets = [
*ceval_datasets, *mmlu_datasets, *hellaswag_datasets, *ARC_c_datasets
]
models = [*hf_yi_6b_model, *hf_qwen_7b_model, *hf_llama2_7b_model]
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
gpqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import \
humaneval_datasets
# ## Math
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets
# Model List
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups
# TODO: Add LiveCodeBench
# ## Instruction Following
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name':
'core_average',
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'], ['bbh', 'naive_average'],
['hellaswag', 'accuracy'], ['drop', 'accuracy'],
['math', 'accuracy'], ['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score']],
},
]
summarizer = dict(
dataset_abbrs=[
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'naive_average'],
['hellaswag', 'accuracy'],
['drop', 'accuracy'],
['math', 'accuracy'],
['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math', 'accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science', 'accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'base_objective')
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
humaneval_datasets
# TODO: Add LiveCodeBench
# ## Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets
# ## Math
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name':
'core_average',
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'], ['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'], ['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'], ['hellaswag', 'accuracy'],
['mathbench-t (average)', 'naive_average']],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'],
['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'],
['hellaswag', 'accuracy'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math', 'accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science', 'accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
'',
['bbh', 'extract_rate'],
['math', 'extract_rate'],
# ['openai_humaneval', 'extract_rate'],
['GPQA_diamond', 'extract_rate'],
# ['IFEval', 'extract_rate'],
'',
['mmlu', 'extract_rate'],
['mmlu-stem', 'extract_rate'],
['mmlu-social-science', 'extract_rate'],
['mmlu-humanities', 'extract_rate'],
['mmlu-other', 'extract_rate'],
'',
['mmlu_pro', 'extract_rate'],
['mmlu_pro_math', 'extract_rate'],
['mmlu_pro_physics', 'extract_rate'],
['mmlu_pro_chemistry', 'extract_rate'],
['mmlu_pro_law', 'extract_rate'],
['mmlu_pro_engineering', 'extract_rate'],
['mmlu_pro_other', 'extract_rate'],
['mmlu_pro_economics', 'extract_rate'],
['mmlu_pro_health', 'extract_rate'],
['mmlu_pro_psychology', 'extract_rate'],
['mmlu_pro_business', 'extract_rate'],
['mmlu_pro_biology', 'extract_rate'],
['mmlu_pro_philosophy', 'extract_rate'],
['mmlu_pro_computer_science', 'extract_rate'],
['mmlu_pro_history', 'extract_rate'],
'',
['cmmlu', 'extract_rate'],
['cmmlu-stem', 'extract_rate'],
['cmmlu-social-science', 'extract_rate'],
['cmmlu-humanities', 'extract_rate'],
['cmmlu-other', 'extract_rate'],
['cmmlu-china-specific', 'extract_rate'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'chat_objective')
import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
needlebench_datasets as needlebench_8k_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
needlebench_datasets as needlebench_32k_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
needlebench_datasets as needlebench_128k_datasets
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
ruler_datasets as ruler_8k_datasets
from opencompass.configs.datasets.ruler.ruler_32k_gen import \
ruler_datasets as ruler_32k_datasets
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
ruler_datasets as ruler_128k_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
models as lmdeploy_internlm2_5_7b_1m_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as llama3_1_8b_instruct_model
# Instruct models
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model
# Summary Groups
from opencompass.configs.summarizers.groups.longbench import \
longbench_summary_groups
from opencompass.configs.summarizers.groups.ruler import \
ruler_summary_groups
from opencompass.configs.summarizers.needlebench import (
needlebench_8k_summarizer, needlebench_32k_summarizer,
needlebench_128k_summarizer)
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
# Instruct models summarizer
summarizer = dict(
dataset_abbrs=[
['ruler_8k', 'naive_average'],
['ruler_32k', 'naive_average'],
['ruler_128k', 'naive_average'],
['NeedleBench-Overall-Score-8K', 'weighted_average'],
['NeedleBench-Overall-Score-32K', 'weighted_average'],
['NeedleBench-Overall-Score-128K', 'weighted_average'],
['longbench', 'naive_average'],
['longbench_zh', 'naive_average'],
['longbench_en', 'naive_average'],
'',
'longbench_single-document-qa',
'longbench_multi-document-qa',
'longbench_summarization',
'longbench_few-shot-learning',
'longbench_synthetic-tasks',
'longbench_code-completion',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4
llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4
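# 1048576 = 1024 * 1024 tokens, i.e. a 1M-token window. rope_scaling_factor
# stretches the RoPE position embeddings so the engine can attend far beyond
# the original training context, and tp=4 / num_gpus=4 shard the
# correspondingly larger KV cache across four GPUs.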
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'long_context')