Commit c289ecc0 authored by xinghao

Initial commit
from mmengine.config import read_base
with read_base():
# Models
# Datasets
from opencompass.configs.datasets.babilong.babilong_0k_gen import \
babiLong_0k_datasets
from opencompass.configs.datasets.babilong.babilong_4k_gen import \
babiLong_4k_datasets
from opencompass.configs.datasets.babilong.babilong_16k_gen import \
babiLong_16k_datasets
from opencompass.configs.datasets.babilong.babilong_32k_gen import \
babiLong_32k_datasets
from opencompass.configs.datasets.babilong.babilong_128k_gen import \
babiLong_128k_datasets
from opencompass.configs.datasets.babilong.babilong_256k_gen import \
babiLong_256k_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
models as lmdeploy_ministral_8b_instruct_2410_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
models as lmdeploy_qwen2_5_7b_instruct_model
from opencompass.configs.summarizers.groups.babilong import \
babilong_summary_groups
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
model['engine_config']['session_len'] = 1024 * 1024
model['max_seq_len'] = 1024 * 1024
model['engine_config']['tp'] = 4
model['run_cfg']['num_gpus'] = 4
summarizer = dict(
dataset_abbrs=[
'babilong_0k',
'babilong_4k',
'babilong_16k',
'babilong_32k',
'babilong_128k',
'babilong_256k',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
work_dir = './outputs/babilong'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import \
gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_base_gen import \
math_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import \
models as hf_internlm2_1_8b_models
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
models as hf_qwen2_1_5b_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import (
gpqa_datasets,
)
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import (
mmlu_pro_datasets,
)
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
ifeval_datasets,
)
from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import (
smolinstruct_datasets_0shot_instruct as smolinstruct_datasets,
)
from opencompass.configs.datasets.ChemBench.ChemBench_llmjudge_gen_c584cf import (
chembench_datasets,
)
from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import (
matbench_datasets,
)
from opencompass.configs.datasets.ProteinLMBench.ProteinLMBench_llmjudge_gen_a67965 import (
proteinlmbench_datasets,
)
# Summary Groups
from opencompass.configs.summarizers.groups.mmlu_pro import (
mmlu_pro_summary_groups,
)
# Models
from opencompass.configs.models.interns1.intern_s1 import \
models as interns1_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# Collect all imported *_datasets for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
[])
# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict()
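# A minimal, commented-out sketch of what judge_cfg could hold: in OpenCompass the
# judge is just another model dict, mirroring the OpenAI-based judge_models used in
# the CHARM and chinese_simpleqa configs later in this commit. The abbr/path/key
# values below are illustrative placeholders, not the settings actually used here.
# from opencompass.models import OpenAI
# judge_cfg = dict(
#     abbr='gpt-4o-judge',
#     type=OpenAI,
#     path='gpt-4o',
#     key='ENV',              # read the API key from the environment
#     query_per_second=4,
#     max_out_len=2048,
#     batch_size=8,
#     temperature=0,
# )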
for item in datasets:
item['infer_cfg']['inferencer']['max_out_len'] = 65536
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys() and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.extend(
[
{
'name': 'ChemBench',
'subsets': [
'ChemBench_Name_Conversion',
'ChemBench_Property_Prediction',
'ChemBench_Mol2caption',
'ChemBench_Caption2mol',
'ChemBench_Product_Prediction',
'ChemBench_Retrosynthesis',
'ChemBench_Yield_Prediction',
'ChemBench_Temperature_Prediction',
],
},
]
)
summarizer = dict(
dataset_abbrs=[
'Knowledge',
['mmlu_pro', 'accuracy'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['GPQA_diamond', 'accuracy'],
'',
'Math Calculation',
['aime2025', 'accuracy'],
'',
'Academic',
['ChemBench', 'naive_average'],
['ProteinLMBench', 'accuracy'],
'',
'SmolInstruct',
['NC-I2F-0shot-instruct', 'score'],
['NC-I2S-0shot-instruct', 'score'],
['NC-S2F-0shot-instruct', 'score'],
['NC-S2I-0shot-instruct', 'score'],
['PP-ESOL-0shot-instruct', 'score'],
['PP-Lipo-0shot-instruct', 'score'],
['PP-BBBP-0shot-instruct', 'accuracy'],
['PP-ClinTox-0shot-instruct', 'accuracy'],
['PP-HIV-0shot-instruct', 'accuracy'],
['PP-SIDER-0shot-instruct', 'accuracy'],
['MC-0shot-instruct', 'score'],
['MG-0shot-instruct', 'score'],
['FS-0shot-instruct', 'score'],
['RS-0shot-instruct', 'score'],
'',
['matbench_expt_gap', 'mae'],
['matbench_steels', 'mae'],
['matbench_expt_is_metal', 'accuracy'],
['matbench_glass', 'accuracy'],
'',
],
summary_groups=summary_groups,
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# infer with local runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask),
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
work_dir = './outputs/oc_bench_intern_s1'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.lveval.lveval import \
LVEval_datasets as datasets
from opencompass.configs.models.bluelm.hf_bluelm_7b_chat_32k import models
from opencompass.configs.summarizers.lveval import summarizer
models[0]['path'] = '/path/to/your/huggingface_models/BlueLM-7B-Chat-32K'
models[0][
'tokenizer_path'] = '/path/to/your/huggingface_models/BlueLM-7B-Chat-32K'
models[0]['max_seq_len'] = 32768
models[0]['generation_kwargs'] = dict(do_sample=False)
models[0]['mode'] = 'mid' # truncate in the middle
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import (
GenericLLMEvaluator,
CascadeEvaluator,
MATHVerifyEvaluator,
)
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import (
MATHDataset,
math_postprocess_v2,
normalize_final_answer,
)
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Models
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
reader_cfg = dict(input_columns=['problem'], output_column='solution')
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
########################## Evaluator #################################
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
),
judge_cfg=dict(),
)
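# Assumed behaviour: with parallel=False the CascadeEvaluator applies the rule-based
# verifier first and only falls back to the LLM judge for predictions the rules reject.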
rule_evaluator = dict(type=MATHVerifyEvaluator)
cascade_evaluator = dict(
    type=CascadeEvaluator,
    llm_evaluator=llm_judge_evaluator,
    rule_evaluator=rule_evaluator,
    parallel=False,
)
########################## Dataset #################################
eval_cfg = dict()
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator
math_datasets = [
dict(
abbr='math_prm800k_500',
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'math_prm800k_500_cascade_evaluator'
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import CharmMemSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
from opencompass.configs.datasets.CHARM.charm_memory_gen_bbbd53 import \
charm_memory_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
## ------------- JudgeLLM Configuration
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
abbr='GPT-3.5-turbo-0125',
type=OpenAI,
path='gpt-3.5-turbo-0125',
key='ENV',
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
max_task_size=1000,
mode='singlescore',
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=2,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=CharmMemSummarizer)
work_dir = './outputs/CHARM_mem/chat/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import \
charm_reason_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
from .summarizers.charm_reason import summarizer
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/CHARM_rea/chat/'
# dataset version metric mode internlm2-chat-7b-turbomind
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
# charm-reason-Direct - naive_average gen 49.51
# charm-reason-ZH-CoT - naive_average gen 61.33
# charm-reason-EN-CoT - naive_average gen 54.55
# charm-reason-XLT - naive_average gen 58.46
# charm-reason-Translate-EN - naive_average gen 56.15
# - - - -
# charm-reason-Chinese_Direct - naive_average gen 47.14
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
# charm-reason-Chinese_XLT - naive_average gen 53.57
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
# charm-reason-Global_Direct - naive_average gen 51.88
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
# charm-reason-Global_EN-CoT - naive_average gen 60.79
# charm-reason-Global_XLT - naive_average gen 63.36
# charm-reason-Global_Translate-EN - naive_average gen 64.10
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models.lagent import LagentAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_agent_gen_af2293 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import \
mathbench_agent_datasets
from opencompass.configs.summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```
def solution():
variable_names_with_real_meaning = func(variable)
return variable_names_with_real_meaning
```"""
protocol = dict(
type=ReActProtocol,
action=dict(role='ACTION', begin='Tool:', end='\n'),
action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
finish=dict(role='FINISH', begin='FinalAnswer:', end='\n'),
call_protocol=system_prompt,
)
models = [
dict(
abbr='gpt-3.5-react',
type=LagentAgent,
agent_type=ReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=PythonInterpreter),
],
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import \
mathbench_datasets
from opencompass.configs.summarizers.math_baseline import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_datasets
models = [
dict(
abbr='gpt-3.5-react',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import \
math_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import \
models as hf_internlm2_chat_1_8b_models
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
models as hf_qwen2_1_5b_instruct_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.openicl import ChatInferencer
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets as datasets
models = [
dict(
abbr='gpt-3.5',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
for dataset in datasets:
# Use ChatInferencer instead of GenInferencer
dataset['infer_cfg']['inferencer'] = dict(type=ChatInferencer)
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Models (add your models here)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model
# Datasets
from opencompass.configs.chatml_datasets.MaScQA.MaScQA_gen import datasets as MaScQA_chatml
from opencompass.configs.chatml_datasets.CPsyExam.CPsyExam_gen import datasets as CPsyExam_chatml
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
chatml_datasets = sum(
(v for k, v in locals().items() if k.endswith('_chatml')),
[],
)
# Your Judge Model Configs Here
judge_cfg = dict()
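# Commented-out sketch of a possible judge_cfg (an ordinary OpenCompass model dict,
# like the judge_models in the subjective-eval configs elsewhere in this commit);
# all values below are placeholders.
# judge_cfg = dict(
#     abbr='gpt-4o-judge',
#     type=OpenAI,            # from opencompass.models (not imported in this file)
#     path='gpt-4o',
#     key='ENV',
#     query_per_second=4,
#     max_out_len=2048,
#     batch_size=8,
# )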
for dataset in chatml_datasets:
if dataset['evaluator']['type'] == 'llm_evaluator':
dataset['evaluator']['judge_cfg'] = judge_cfg
if dataset['evaluator']['type'] == 'cascade_evaluator':
dataset['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=NaivePartitioner, n=8),
runner=dict(
type=LocalRunner, task=dict(type=OpenICLEvalTask), max_num_workers=32
),
)
work_dir = 'outputs/ChatML_Datasets'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ChemBench.ChemBench_gen import \
chembench_datasets
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models
datasets = [*chembench_datasets]
models = [*models]
'''
dataset version metric mode mistral-7b-instruct-v0.2-hf
-------------------------------- --------- -------- ------ -----------------------------
ChemBench_Name_Conversion d4e6a1 accuracy gen 45.43
ChemBench_Property_Prediction d4e6a1 accuracy gen 47.11
ChemBench_Mol2caption d4e6a1 accuracy gen 64.21
ChemBench_Caption2mol d4e6a1 accuracy gen 35.38
ChemBench_Product_Prediction d4e6a1 accuracy gen 38.67
ChemBench_Retrosynthesis d4e6a1 accuracy gen 27
ChemBench_Yield_Prediction d4e6a1 accuracy gen 27
ChemBench_Temperature_Prediction d4e6a1 accuracy gen 26.73
ChemBench_Solvent_Prediction d4e6a1 accuracy gen 32.67
'''
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Inference Stage ----------------------------------------
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='Qwen2.5-1.5B-Instruct',
path='Qwen/Qwen2.5-1.5B-Instruct',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(do_sample=True, ),
max_out_len=200,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
api_meta_template = dict(round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
# GPT4o
abbr='gpt-4o-0513-global',
type=OpenAI,
# gpt-4o
path='gpt-4o-0513-global',
key='xxx', # provide OPENAI_API_KEY
meta_template=api_meta_template,
query_per_second=16,
max_out_len=1000,
batch_size=8,
retry=3)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
work_dir = 'outputs/chinese_simpleqa/'
from copy import deepcopy
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import HuggingFaceCausalLM
from opencompass.models.lagent import CodeAgent, LagentAgent
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# Note: HF models may hit CUDA OOM errors here
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.summarizers.cibench import summarizer
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = './outputs/cibench/'
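# Ensure every base model's meta_template has a SYSTEM round (added below if missing),
# which the agent protocol uses for tool/system messages.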
_agent_models = []
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM'
for r in round): # no system round
if not any('api_role' in r for r in round):
m['meta_template']['round'].append(
dict(role='system', begin='System response:', end='\n'))
else:
m['meta_template']['round'].append(
dict(role='system', api_role='SYSTEM'))
print(
f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}'
)
_agent_models.append(m)
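# Wrap each patched base model into a CodeAgent driving CIReAct with the IPython
# interpreter action defined above.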
models = []
for m in _agent_models:
m = deepcopy(m)
origin_abbr = m.pop('abbr')
abbr = origin_abbr
m.pop('batch_size', None)
m.pop('max_out_len', None)
m.pop('max_seq_len', None)
run_cfg = m.pop('run_cfg', {})
agent_model = dict(
abbr=abbr,
summarizer_abbr=origin_abbr,
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=m,
actions=[
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
],
protocol=protocol,
batch_size=1,
run_cfg=run_cfg,
)
models.append(agent_model)
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
from opencompass.configs.summarizers.cibench import summarizer
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
dict(role='SYSTEM', api_role='SYSTEM'),
], )
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = 'outputs/cibench/'
models = [
dict(
abbr='gpt-4o',
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-4o',
rpm_verbose=True,
retry=99,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=2048,
temperature=0,
),
actions=actions,
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.datasets.circular import (
CircularARCDataset, CircularCEvalDataset, CircularCMMLUDataset,
CircularCSQADataset, CircularEvaluator, CircularHSWAGDataset,
CircularMMLUDataset, CircularOBQADataset, CircularRaceDataset)
from opencompass.summarizers import CircularSummarizer
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import \
ARC_c_datasets
from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import \
ARC_e_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \
cmmlu_datasets
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import \
commonsenseqa_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \
models as hf_internlm_chat_7b_model
from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \
models as hf_internlm_chat_20b_model
from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
models as hf_qwen_7b_chat_model
from opencompass.configs.models.qwen.hf_qwen_14b_chat import \
models as hf_qwen_14b_chat_model
from opencompass.configs.summarizers.groups.ceval import \
ceval_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
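# Swap each dataset in for its circular variant (answer options rotated through all
# cyclic orders) and score it with CircularEvaluator.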
for ds, t in [
(ceval_datasets, CircularCEvalDataset),
(mmlu_datasets, CircularMMLUDataset),
(cmmlu_datasets, CircularCMMLUDataset),
(hellaswag_datasets, CircularHSWAGDataset),
(ARC_e_datasets, CircularARCDataset),
(ARC_c_datasets, CircularARCDataset),
(commonsenseqa_datasets, CircularCSQADataset),
(obqa_datasets, CircularOBQADataset),
(race_datasets, CircularRaceDataset),
]:
for d in ds:
d['type'] = t
d['abbr'] = d['abbr'] + '-circular-4'
d['eval_cfg']['evaluator'] = {
'type': CircularEvaluator,
'circular_pattern': 'circular'
}
d['circular_patterns'] = 'circular'
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# config summarizer
other_summary_groups = [
{
'name':
'average',
'subsets': [
'ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c',
'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high'
]
},
]
origin_summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
new_summary_groups = []
for item in origin_summary_groups:
new_summary_groups.append({
'name':
item['name'] + '-circular-4',
'subsets': [i + '-circular-4' for i in item['subsets']],
})
summarizer = dict(
type=CircularSummarizer,
metric_types=['acc_origin', 'perf_circular'],
dataset_abbrs=[
'average-circular-4',
'ceval-circular-4',
'mmlu-circular-4',
'cmmlu-circular-4',
'hellaswag-circular-4',
'ARC-e-circular-4',
'ARC-c-circular-4',
'commonsense_qa-circular-4',
'openbookqa_fact-circular-4',
'race-middle-circular-4',
'race-high-circular-4',
'ceval-humanities-circular-4',
'ceval-stem-circular-4',
'ceval-social-science-circular-4',
'ceval-other-circular-4',
'mmlu-humanities-circular-4',
'mmlu-stem-circular-4',
'mmlu-social-science-circular-4',
'mmlu-other-circular-4',
'cmmlu-humanities-circular-4',
'cmmlu-stem-circular-4',
'cmmlu-social-science-circular-4',
'cmmlu-other-circular-4',
'cmmlu-china-specific-circular-4',
],
summary_groups=new_summary_groups,
)
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.models.claude.claude import models
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with `num_return_sequences`,
# i.e. for models that can generate multiple responses for a single input
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
models = [
dict(
type=HuggingFaceCausalLM,
abbr='CodeLlama-7b-Python',
path='codellama/CodeLlama-7b-Python-hf',
tokenizer_path='codellama/CodeLlama-7b-Python-hf',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
num_return_sequences=10,
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=300),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with dataset repetition,
# i.e. for models that cannot generate multiple responses for a single input
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
models = [
dict(
abbr='internlm-chat-7b-hf-v11',
type=HuggingFaceCausalLM,
path='internlm/internlm-chat-7b-v1_1',
tokenizer_path='internlm/internlm-chat-7b-v1_1',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_seq_len=2048,
meta_template=_meta_template,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
batch_size=8,
)
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=600),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)