Commit c289ecc0 by xinghao: Initial commit
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.lawbench.lawbench_one_shot_gen_002588 import \
lawbench_datasets as lawbench_one_shot_datasets
from opencompass.configs.datasets.lawbench.lawbench_zero_shot_gen_002588 import \
lawbench_datasets as lawbench_zero_shot_datasets
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
from opencompass.configs.summarizers.lawbench import summarizer
datasets = lawbench_zero_shot_datasets + lawbench_one_shot_datasets
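# save_every=1 makes the inferencer dump predictions after every sample, so an
# interrupted run can be resumed from the last saved result.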
for d in datasets:
d['infer_cfg']['inferencer']['save_every'] = 1
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
from opencompass.configs.summarizers.rewardbench import summarizer
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
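# The meta template below maps OpenCompass conversation roles onto API roles;
# generate=True marks the turn the model is expected to produce.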
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
datasets = [*get_rewardbench_datasets]
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='qwen-7b-hf',
path='Qwen/Qwen-7B',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=72,
task=dict(type=OpenICLInferTask),
),
)
work_dir = './outputs/rewardbench/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.judge.rmb import get_rmb_dataset
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
datasets = [*get_rmb_dataset]
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='qwen-7b-hf',
path='Qwen/Qwen-7B',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1),
),
]
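# NumWorkerPartitioner shards each dataset across `num_worker` tasks so they can
# run in parallel, whereas the NaivePartitioner (commented out below) would emit
# one task per model-dataset pair.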
infer = dict(
# partitioner=dict(type=NaivePartitioner),
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=72,
task=dict(type=OpenICLInferTask),
),
)
work_dir = './outputs/rmb/'
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
with read_base():
from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets # CWE
from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets # FWE
from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets # Niah
from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets # QA
from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets # VT
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
models as internlm2_5_7b_chat_1m,
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
models as llama3_8b_instruct_model,
)
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as qwen2_7b_instruct_model,
)
from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups
import_datasets = sum(
[niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])
# Evaluation config
NUM_SAMPLES = 500
# Change the context lengths to be tested
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
abbr_suffixs = ['4k', '8k', '16k', '32k']
work_dir = './outputs/ruler'
# Model Settings
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2
llama3_8b_instruct_model[0]['max_seq_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2
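# Each entry pairs a model config with the tokenizer path that the RULER
# datasets use below to build prompts of the target context length.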
model_settings = [
[qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
[llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
[internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
]
# Dataset Model Combination
datasets = []
models = []
model_dataset_combinations = []
# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
for model, model_path in model_settings:
_tmp_datasets = []
for dataset in import_datasets:
tmp_dataset = dataset.deepcopy()
tmp_dataset['tokenizer_model'] = model_path
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
_tmp_datasets.append(tmp_dataset)
model_dataset_combinations.append(
dict(models=[model], datasets=_tmp_datasets))
models.append(model)
datasets.extend(_tmp_datasets)
infer = dict(
partitioner=dict(type=NumWorkerPartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask),
retry=5),
)
eval = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=32,
task=dict(type=OpenICLEvalTask)),
)
summarizer = dict(
dataset_abbrs=abbr_suffixs,
summary_groups=sum([ruler_summary_groups], []),
)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind internlm2_5-7b-chat-1m-turbomind
# --------- --------- ------------- ------ ----------------------------- ------------------------------- ----------------------------------
# 4k - naive_average gen 93.66 93.48 91.20
# 8k - naive_average gen 88.38 89.95 89.07
# 16k - naive_average gen 84.27 0.14 87.61
# 32k - naive_average gen 81.36 0.00 84.59
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
with read_base():
from opencompass.configs.datasets.ruler.ruler_combined_gen import \
ruler_combined_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
models as internlm2_5_7b_chat_1m
from opencompass.configs.summarizers.groups.ruler import \
ruler_summary_groups
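# Collect every imported *_datasets list into a single flat list.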
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = internlm2_5_7b_chat_1m
work_dir = './outputs/ruler'
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask),
retry=5),
)
eval = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=32,
task=dict(type=OpenICLEvalTask)),
)
summarizer = dict(
dataset_abbrs=['ruler_4k', 'ruler_8k', 'ruler_16k', 'ruler_32k'],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.collections.base_medium_llama import \
datasets
from opencompass.configs.models.rwkv.rwkv5_3b import models
from opencompass.configs.summarizers.leaderboard import summarizer
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.SimpleQA.simpleqa_gen import \
simpleqa_datasets
from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
models as gpt_4o_2024_05_13_model
models = gpt_4o_2024_05_13_model # model for generation
judge_models = gpt_4o_2024_05_13_model # model for evaluation
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=256,
task=dict(type=SubjectiveEvalTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
from opencompass.configs.datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
from opencompass.configs.datasets.subjective.fofo.fofo_judge import fofo_datasets
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable do_sample for the models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,  # For subjective evaluation, we suggest setting do_sample=True when running model inference!
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [
*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets,
*compassarena_datasets, *compassbench_datasets, *fofo_datasets,
*mtbench_datasets, *mtbench101_datasets, *wildbench_datasets
] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can also write your key here
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
work_dir = 'outputs/subjective/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3)
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AlpacaSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
# To run this config, please make sure `alpaca-eval==0.6` and `scikit-learn==1.5` are installed successfully
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable do_sample for the models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(do_sample=True, ),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*alpacav2]
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
gpt4_judge = dict(
abbr='GPT4-Turbo',
path='gpt-4-1106-preview',
key='',  # The key will be obtained from $OPENAI_API_KEY, but you can also write your key here
config='weighted_alpaca_eval_gpt4_turbo')
## ------------- Evaluation Configuration
eval = dict(partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=256,
task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
))
work_dir = 'outputs/alpaca/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
alpacav2_datasets, )
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_bradleyterry import (
arenahard_datasets, )
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
compassarena_datasets, )
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
wildbench_datasets, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct, )
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct, )
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import (CompassArenaBradleyTerrySummarizer,
SubjectiveSummarizer)
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable do_sample for the models
models = [
*lmdeploy_internlm2_5_7b_chat,
*lmdeploy_internlm2_5_20b_chat,
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*alpacav2_datasets,
*arenahard_datasets,
*compassarena_datasets,
*wildbench_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry model (statistical model) with an option
# to include style features and control variables based on groups
# (group variables must be available in the input dataset for each observation).
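# In its simplest form, Bradley-Terry models the probability that model i beats
# model j as P(i > j) = exp(beta_i) / (exp(beta_i) + exp(beta_j)), i.e. a
# logistic regression on the rating difference; the style features and control
# variables enabled below presumably enter as extra covariates of that regression.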
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
report_pred_win_rates=True,
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
)
work_dir = 'outputs/subjective/bradleyterry'
from copy import deepcopy
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
teval_datasets as teval_en_datasets
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
teval_datasets as teval_zh_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
models as hf_internlm2_chat_7b_model
from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \
models as hf_llama2_7b_chat_model
from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
models as hf_qwen_7b_chat_model
from opencompass.configs.summarizers.teval import summarizer
meta_template_system_patches = {
'internlm2-chat-7b-hf':
dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
'internlm2-chat-20b-hf':
dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
}
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
models = []
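# Some chat templates have no SYSTEM round. Patch one in: use the
# model-specific SYSTEM round from the table above if available, otherwise
# copy the HUMAN round and relabel it as SYSTEM.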
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM'
for r in round): # no system round
if m['abbr'] in meta_template_system_patches:
system_round = meta_template_system_patches[m['abbr']]
else:
system_round = [
r for r in round if r['role'].upper() == 'HUMAN'
][0]
system_round = deepcopy(system_round)
system_round['role'] = 'SYSTEM'
m['meta_template']['round'].append(system_round)
else:
raise ValueError(f'no meta_template.round in {m.get("abbr", None)}')
print(
f'model {m["abbr"]} is using the following meta_template: {m["meta_template"]}'
)
models.append(m)
datasets = teval_en_datasets + teval_zh_datasets
work_dir = './outputs/teval'
"""Dataset version metric mode
qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.
------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
teval - naive_average unknown 57.69 78.18 36.63
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
teval-instruct_v1 10482d json_metric unknown 94.32 97.08 0.15
teval-plan_str_v1 10482d f1_score unknown 66.24 84.12 45.72
teval-plan_json_v1 10482d f1_score unknown 63.62 77.71 19.95
teval-reason_str_v1 10482d thought unknown 54.14 63.58 44.92
teval-reason_retrieve_understand_json_v1 10482d thought unknown 33.77 54.72 21.49
teval-retrieve_str_v1 10482d name unknown 73.89 85.28 60.6
teval-reason_retrieve_understand_json_v1 10482d name unknown 31.15 68.97 15.34
teval-understand_str_v1 10482d args unknown 77.76 93.03 65.61
teval-reason_retrieve_understand_json_v1 10482d args unknown 44.16 72.23 26.84
teval-review_str_v1 10482d review_quality unknown 62.22 71.66 44.35
teval_zh - naive_average unknown 61.31 75.01 32.33
teval-instruct_v1_zh 10482d string_metric unknown 88.69 98.19 23.64
teval-instruct_v1_zh 10482d json_metric unknown 75.77 96.62 0.89
teval-plan_str_v1_zh 10482d f1_score unknown 62.43 70.69 47.82
teval-plan_json_v1_zh 10482d f1_score unknown 61.46 68.95 15.87
teval-reason_str_v1_zh 10482d thought unknown 59.43 68.14 46.96
teval-reason_retrieve_understand_json_v1_zh 10482d thought unknown 39.19 60.37 23.91
teval-retrieve_str_v1_zh 10482d name unknown 69.41 84.22 54.44
teval-reason_retrieve_understand_json_v1_zh 10482d name unknown 32.87 70.46 14.16
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
"""
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets as chat_ceval_datasets
from opencompass.configs.datasets.ceval.ceval_ppl_578f8d import \
ceval_datasets as base_ceval_datasets
from opencompass.configs.internal.clusters.slurm import eval, infer
from opencompass.configs.models.qwen.hf_qwen_7b import \
models as hf_qwen_7b_base_models
from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
models as hf_qwen_7b_chat_models
# from opencompass.configs.internal.clusters.slurm import infer_split as infer, eval
# from opencompass.configs.internal.clusters.slurm import infer_size as infer, eval
# from opencompass.configs.internal.clusters.slurm import infer_size_split as infer, eval
base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]
# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (different `abbr` in the model & dataset configs), as we haven't tested the
# overlapping case thoroughly.
model_dataset_combinations = [
dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
# dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]
# The union of the models and datasets in model_dataset_combinations should be
# stored in the `models` and `datasets` variables below. Otherwise, modules
# like the summarizer will miss some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]
work_dir = './outputs/default/mdcomb/'
"""
dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
---------------------- --------- -------- ------ ------------ -----------------
ceval-computer_network 9b9417 accuracy ppl 52.63 -
ceval-physician 6e277d accuracy gen - 59.18
"""
__version__ = '0.5.1'
# flake8: noqa
# yapf: disable
import argparse
import copy
import getpass
import os
import os.path as osp
from datetime import datetime
from mmengine.config import Config, DictAction
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import (LarkReporter, get_logger, pretty_print_config,
read_from_station, save_to_station)
from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
get_config_from_arg)
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
parser.add_argument('config', nargs='?', help='Train config file path')
# add mutually exclusive args `--slurm` and `--dlc`; defaults to the local runner
# if "infer" or "eval" is not specified in the config
launch_method = parser.add_mutually_exclusive_group()
launch_method.add_argument('--slurm',
action='store_true',
default=False,
help='Whether to force tasks to run with srun. '
'If True, `--partition(-p)` must be set. '
'Defaults to False')
launch_method.add_argument('--dlc',
action='store_true',
default=False,
help='Whether to force tasks to run on dlc. If '
'True, `--aliyun-cfg` must be set. Defaults'
' to False')
# Add shortcut parameters (models, datasets and summarizer)
parser.add_argument('--models', nargs='+', help='', default=None)
parser.add_argument('--datasets', nargs='+', help='', default=None)
parser.add_argument('--summarizer', help='', default=None)
# add general args
parser.add_argument('--debug',
help='Debug mode, in which scheduler will run tasks '
'in the single process, and output will not be '
'redirected to files',
action='store_true',
default=False)
parser.add_argument('--dry-run',
help='Dry run mode, in which the scheduler will not '
'actually run the tasks, but only print the commands '
'to run',
action='store_true',
default=False)
parser.add_argument(
'-a', '--accelerator',
help='Inference accelerator; vllm and lmdeploy are supported for now.',
choices=['vllm', 'lmdeploy', None],
default=None,
type=str)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
'only want the inference results, or "eval" if you '
'already have the results and want to evaluate them, '
'or "viz" if you want to visualize the results.',
choices=['all', 'infer', 'eval', 'viz'],
default='all',
type=str)
parser.add_argument('-r',
'--reuse',
nargs='?',
type=str,
const='latest',
help='Reuse previous outputs & results, and run any '
'missing jobs present in the config. If no '
'argument is given, the latest results in '
'the work_dir will be reused. The argument can '
'also be a specific timestamp, e.g. 20230516_144254')
parser.add_argument('-w',
'--work-dir',
help='Work path, all the outputs will be '
'saved in this path, including the slurm logs, '
'the evaluation results, the summary results, etc. '
'If not specified, the work_dir will be set to '
'outputs/default.',
default=None,
type=str)
parser.add_argument(
'--config-dir',
default='configs',
help='Use the custom config directory instead of configs/ to '
'search the configs for datasets, models and summarizers',
type=str)
parser.add_argument(
'--config-verbose',
default=False,
action='store_true',
help='Whether to print the config in verbose mode.')
parser.add_argument('-l',
'--lark',
help='Report the running status to lark bot',
action='store_true',
default=False)
parser.add_argument('--max-num-workers',
help='Max number of workers to run in parallel. '
'Will be overridden by the "max_num_workers" argument '
'in the config.',
type=int,
default=1)
parser.add_argument('--max-workers-per-gpu',
help='Max task to run in parallel on one GPU. '
'It will only be used in the local runner.',
type=int,
default=1)
parser.add_argument(
'--retry',
help='Number of retries if the job failed when using slurm or dlc. '
'Will be overridden by the "retry" argument in the config.',
type=int,
default=2)
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc. Defaults to True.',
nargs='?',
const=True,
default=True,
type=lambda x: False if x and x.lower() == 'false' else True
)
parser.add_argument(
'--dump-extract-rate',
help='Whether to dump the answer extraction success rate '
'for each dataset.',
action='store_true',
)
# for the results persistence
parser.add_argument('-sp',
'--station-path',
help='Path to your results station.',
type=str,
default=None,
)
parser.add_argument('--station-overwrite',
help='Whether to overwrite the results at station.',
action='store_true',
)
parser.add_argument(
'--read-from-station',
help='Whether to read existing evaluation results from the '
'data station.',
action='store_true',
)
# for evaluation with multiple runs
parser.add_argument('--dataset-num-runs',
help='How many runs for one dataset',
type=int,
default=1,
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
# set dlc args
dlc_parser = parser.add_argument_group('dlc_args')
parse_dlc_args(dlc_parser)
# set hf args
hf_parser = parser.add_argument_group('hf_args')
parse_hf_args(hf_parser)
# set custom dataset args
custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
parse_custom_dataset_args(custom_dataset_parser)
args = parser.parse_args()
if args.slurm:
assert args.partition is not None, (
'--partition(-p) must be set if you want to use slurm')
if args.dlc:
assert os.path.exists(args.aliyun_cfg), (
'When launching tasks using dlc, it needs to be configured '
'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
' to specify a new path.')
return args
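# Illustrative invocations (assuming this module is exposed as run.py; the
# config path is hypothetical, all flags are defined above):
#   python run.py configs/eval_demo.py --debug
#   python run.py configs/eval_demo.py --slurm -p <partition> --max-num-workers 32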
def parse_slurm_args(slurm_parser):
"""These args are all for slurm launch."""
slurm_parser.add_argument('-p',
'--partition',
help='Slurm partition name',
default=None,
type=str)
slurm_parser.add_argument('-q',
'--quotatype',
help='Slurm quota type',
default=None,
type=str)
slurm_parser.add_argument('--qos',
help='Slurm quality of service',
default=None,
type=str)
def parse_dlc_args(dlc_parser):
"""These args are all for dlc launch."""
dlc_parser.add_argument('--aliyun-cfg',
help='The config path for aliyun config',
default='~/.aliyun.cfg',
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
hf_parser.add_argument('--hf-path', type=str, help='The path to the HuggingFace model, e.g. "facebook/opt-125m", required')
hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the HuggingFace model')
hf_parser.add_argument('--tokenizer-path', type=str, help='The path to the HuggingFace tokenizer, same as --hf-path if not specified')
hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the tokenizer')
hf_parser.add_argument('--peft-path', type=str, help='The path to the PEFT model')
hf_parser.add_argument('--peft-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the PEFT model')
hf_parser.add_argument('--generation-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the generation')
hf_parser.add_argument('--max-seq-len', type=int, help='The max sequence length for the HuggingFace model')
hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model')
hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model')
hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model')
hf_parser.add_argument('--num-gpus', type=int, default=None, help='Deprecated, please use --hf-num-gpus instead')
hf_parser.add_argument('--hf-num-gpus', type=int, default=1, help='The number of GPUs for the HuggingFace model passed via cli')
hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model')
hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model')
def parse_custom_dataset_args(custom_dataset_parser):
"""These args are all for the quick construction of custom datasets."""
custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-data-type',
type=str,
choices=['mcq', 'qa'])
custom_dataset_parser.add_argument('--custom-dataset-infer-method',
type=str,
choices=['gen', 'ppl'])
def main():
args = parse_args()
if args.num_gpus is not None:
raise ValueError('The `--num-gpus` argument is deprecated, please use '
'`--hf-num-gpus` to describe number of gpus used for '
'the HuggingFace model instead.')
if args.dry_run:
args.debug = True
# initialize logger
logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')
cfg = get_config_from_arg(args)
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', os.path.join('outputs', 'default'))
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
if args.reuse:
if args.reuse == 'latest':
if not os.path.exists(cfg.work_dir) or not os.listdir(
cfg.work_dir):
logger.warning('No previous results to reuse!')
else:
dirs = os.listdir(cfg.work_dir)
dir_time_str = sorted(dirs)[-1]
else:
dir_time_str = args.reuse
logger.info(f'Reusing experiments from {dir_time_str}')
elif args.mode in ['eval', 'viz'] and not args.read_from_station:
raise ValueError(
'You must specify -r or --reuse, or you have to specify '
'--read-from-station and --station-path when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
current_workdir = cfg['work_dir']
logger.info(f'Current exp folder: {current_workdir}')
os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)
# dump config
output_config_path = osp.join(cfg.work_dir, 'configs',
f'{cfg_time_str}_{os.getpid()}.py')
cfg.dump(output_config_path)
# The config is intentionally reloaded here to avoid the problem that
# initialized types cannot be serialized
cfg = Config.fromfile(output_config_path, format_python_code=False)
# get existing results from the station
if args.read_from_station:
existing_results_list = read_from_station(cfg, args)
rs_exist_results = [comb['combination'] for comb in existing_results_list]
cfg['rs_exist_results'] = rs_exist_results
# report to the lark bot if --lark is specified
if not args.lark:
cfg['lark_bot_url'] = None
elif cfg.get('lark_bot_url', None):
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
# print the config if --config-verbose is specified
if args.config_verbose:
pretty_print_config(cfg)
# infer
if args.mode in ['all', 'infer']:
# When the user has specified --slurm or --dlc, or has not set
# "infer" in the config, we provide a default configuration
# for infer
if (args.dlc or args.slurm) and cfg.get('infer', None):
logger.warning('You have set "infer" in the config, but '
'also specified --slurm or --dlc. '
'The "infer" configuration will be overridden by '
'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('infer', None) is None:
fill_infer_cfg(cfg, args)
if args.partition is not None:
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
cfg.infer.runner.partition = args.partition
cfg.infer.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.infer.runner.debug = True
if args.lark:
cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
'predictions/')
partitioner = PARTITIONERS.build(cfg.infer.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.infer.runner)
# Add extra attack config if it exists
if hasattr(cfg, 'attack'):
for task in tasks:
cfg.attack.dataset = task.datasets[0][0].abbr
task.attack = cfg.attack
runner(tasks)
# evaluate
if args.mode in ['all', 'eval']:
# When the user has specified --slurm or --dlc, or has not set
# "eval" in the config, we provide a default configuration
# for eval
if (args.dlc or args.slurm) and cfg.get('eval', None):
logger.warning('You have set "eval" in the config, but '
'also specified --slurm or --dlc. '
'The "eval" configuration will be overridden by '
'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
logger.warning('Dumping eval details by default; it might take extra '
'space to save all the evaluation details. '
'Set --dump-eval-details False to skip the details dump')
cfg.eval.runner.task.dump_details = True
if args.dump_extract_rate:
cfg.eval.runner.task.cal_extract_rate = True
if args.partition is not None:
if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
cfg.eval.runner.partition = args.partition
cfg.eval.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.eval.runner.debug = True
if args.lark:
cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.eval.runner)
# For meta-review-judge in subjective evaluation
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
for task_part in tasks:
runner(task_part)
else:
runner(tasks)
# save to station
if args.station_path is not None or cfg.get('station_path') is not None:
save_to_station(cfg, args)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})
# For subjective summarizer
if summarizer_cfg.get('function', None):
main_summarizer_cfg = copy.deepcopy(summarizer_cfg)
grouped_datasets = {}
for dataset in cfg.datasets:
prefix = dataset['abbr'].split('_')[0]
if prefix not in grouped_datasets:
grouped_datasets[prefix] = []
grouped_datasets[prefix].append(dataset)
all_grouped_lists = []
for prefix in grouped_datasets:
all_grouped_lists.append(grouped_datasets[prefix])
dataset_score_container = []
for dataset in all_grouped_lists:
temp_cfg = copy.deepcopy(cfg)
temp_cfg.datasets = dataset
summarizer_cfg = dict(type=dataset[0]['summarizer']['type'], config=temp_cfg)
summarizer = build_from_cfg(summarizer_cfg)
dataset_score = summarizer.summarize(time_str=cfg_time_str)
if dataset_score:
dataset_score_container.append(dataset_score)
main_summarizer_cfg['config'] = cfg
main_summarizer = build_from_cfg(main_summarizer_cfg)
main_summarizer.summarize(time_str=cfg_time_str, subjective_scores=dataset_score_container)
else:
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
summarizer_cfg['type'] = DefaultSummarizer
summarizer_cfg['config'] = cfg
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()
datasets = [
dict(
abbr='CPsyExam',
path='./data/CPsyExam/merged_train_dev.jsonl',
evaluator=dict(
type='llm_evaluator',
judge_cfg=dict(),
),
n=1,
),
]
subset_list = [
'test',
'valid',
]
language_list = [
'CN',
'EN',
]
datasets = []
for subset in subset_list:
for language in language_list:
datasets.append(
dict(
abbr=f'CS-Bench_{language}_{subset}',
path=f'./data/csbench/CSBench-{language}/{subset}.jsonl',
evaluator=dict(
type='llm_evaluator',
judge_cfg=dict(),
),
)
)
datasets = [
dict(
abbr='C-MHChem',
path='./data/C-MHChem2.jsonl',
evaluator=dict(
type='llm_evaluator',
judge_cfg=dict(),
),
n=1,
),
]
datasets = [
dict(
abbr='MaScQA',
path='./data/MaScQA/MaScQA.jsonl',
evaluator=dict(
type='llm_evaluator',
judge_cfg=dict(),
),
n=1,
),
]
subset_list = [
'AtomicPhysics',
'ClassicalElectromagnetism',
'ClassicalMechanics',
'Electrodynamics',
'GeometricalOptics',
'QuantumMechanics',
'Relativity',
'Solid-StatePhysics',
'StatisticalMechanics',
'SemiconductorPhysics',
'Thermodynamics',
'TheoreticalMechanics',
'WaveOptics',
]
language_list = [
'zh',
'en',
]
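# Build one UGPhysics dataset entry per (subject, language) pair; each is
# scored by an LLM judge configured via `llm_evaluator`.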
datasets = []
for subset in subset_list:
for language in language_list:
datasets.append(
dict(
abbr=f'UGPhysics_{subset}_{language}',
path=f'./data/ugphysics/{subset}/{language}.jsonl',
evaluator=dict(
type='llm_evaluator',
judge_cfg=dict(),
),
)
)