Unverified commit b39f5015, authored by Fengzhe Zhou, committed by GitHub

[Sync] update taco (#1030)

parent 16f29b25
@@ -12,7 +12,6 @@ compassbench_v1_knowledge_groups = [
    {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
-        'compassbench_v1_knowledge-mixed-cloze_en'

summarizer = dict(
    dataset_abbrs=[
        'knowledge_perf_4_and_cloze',
...
from mmengine.config import read_base
with read_base():
from .groups.cibench import cibench_summary_groups
from .groups.plugineval import plugineval_summary_groups
compassbench_v1_language_names = [
# ['information_retrieval_en', 'score'],
# ['information_retrieval_zh', 'score'],
['intention_recognition_en_circular', 'acc_origin'],
['intention_recognition_en_circular', 'perf_circular'],
['intention_recognition_zh_circular', 'acc_origin'],
['intention_recognition_zh_circular', 'perf_circular'],
['sentiment_analysis_en_circular', 'acc_origin'],
['sentiment_analysis_en_circular', 'perf_circular'],
['sentiment_analysis_zh_circular', 'acc_origin'],
['sentiment_analysis_zh_circular', 'perf_circular'],
['translation', 'score'],
['content_critic_en_circular', 'acc_origin'],
['content_critic_en_circular', 'perf_circular'],
['content_critic_zh_circular', 'acc_origin'],
['content_critic_zh_circular', 'perf_circular'],
['content_summarization_en', 'rouge1'],
['content_summarization_zh', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'acc_origin'],
['traditional_cultural_understanding_zh_circular', 'perf_circular'],
['chinese_semantic_understanding_zh_circular', 'acc_origin'],
['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]
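# The groups below aggregate these subsets per language and per metric: the *_acc_1_* groups
# keep 'acc_origin' plus the non-MCQ scores (translation, rouge1), while the *_perf_4_* groups
# keep 'perf_circular' plus the same non-MCQ scores.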
compassbench_v1_language_summary_groups = [
{'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']},
{'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']},
{'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
{'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']},
{'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']},
{'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]
# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
compassbench_v1_knowledge_names = [
'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]
compassbench_v1_knowledge_summary_groups = [
{'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
{'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
{'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
compassbench_v1_reason_summary_groups = [
{'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']},
{'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']},
{'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']},
{'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
{'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
{'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']},
{'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']},
{'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']},
]
compassbench_v1_math_summary_groups = [
{'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
{'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
{'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]},
{'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
]
code_passk_summary_groups = [
# rename
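    # Both the `_passk` and `_repeat10` variants of each dataset are mapped to the same
    # group name, so each row is filled from whichever variant was actually evaluated.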
{'name': 'humaneval_pass@1(greedy)', 'subsets': [['openai_humaneval', 'humaneval_pass@1']]},
{'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_passk', 'humaneval_pass@10']]},
{'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_repeat10', 'humaneval_pass@10']]},
{'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]},
{'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_passk', 'humaneval_pass@10']]},
{'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_repeat10', 'humaneval_pass@10']]},
{'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]},
{'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_passk', 'humaneval_plus_pass@10']]},
{'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_repeat10', 'humaneval_plus_pass@10']]},
{'name': 'mbpp_pass@1(greedy)', 'subsets': [['mbpp', 'score']]},
{'name': 'mbpp_pass@10', 'subsets': [['mbpp_passk', 'pass@10']]},
{'name': 'mbpp_pass@10', 'subsets': [['mbpp_repeat10', 'pass@10']]},
{'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]},
{'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_passk', 'pass@10']]},
{'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_repeat10', 'pass@10']]},
{'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]},
{'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_passk', 'pass@10']]},
{'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
# real add
{'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
{'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']},
{'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
{'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
]
agent_summary_groups = [
# dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']),
# dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']),
dict(name='cibench_template', subsets=['cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim']),
dict(name='cibench_template_cn', subsets=['cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim']),
dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']),
dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']),
dict(name='agent', subsets=['agent_cn', 'agent_en']),
]
other_summary_groups = [
{
"name": "average_cn",
"subsets": [
["language_zh_perf_4_and_non_mcq", "naive_average"],
["knowledge_cn", "perf_4"],
["reasonbench_cn_circular", "perf_circular"],
["math_perf_4_and_fill_in_blank_cn", "naive_average"],
["code_cn", "naive_average"],
["agent_cn", "naive_average"],
],
},
{
"name": "average_en",
"subsets": [
["language_en_perf_4_and_non_mcq", "naive_average"],
["compassbench_v1_knowledge-mixed-cloze_en", "score"],
["reasonbench_en_circular", "perf_circular"],
["math_perf_4_and_fill_in_blank_en", "naive_average"],
["code_en", "naive_average"],
["agent_en", "naive_average"],
],
},
{
"name": "average",
"subsets": [
["language_perf_4_and_non_mcq", "naive_average"],
["knowledge_perf_4_and_cloze", "naive_average"],
["reasonbench", "perf_circular"],
["math_perf_4_and_fill_in_blank", "naive_average"],
["code", "naive_average"],
["agent", "naive_average"],
],
},
]
summarizer = dict(
dataset_abbrs=[
['average', 'naive_average'],
['average_cn', 'naive_average'],
['average_en', 'naive_average'],
'',
'',
'',
['language_perf_4_and_non_mcq', 'naive_average'],
['language_zh_perf_4_and_non_mcq', 'naive_average'],
['language_en_perf_4_and_non_mcq', 'naive_average'],
['intention_recognition_zh_circular', 'perf_circular'],
['intention_recognition_en_circular', 'perf_circular'],
['sentiment_analysis_zh_circular', 'perf_circular'],
['sentiment_analysis_en_circular', 'perf_circular'],
['translation', 'score'],
['content_critic_zh_circular', 'perf_circular'],
['content_critic_en_circular', 'perf_circular'],
['content_summarization_zh', 'rouge1'],
['content_summarization_en', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'perf_circular'],
['chinese_semantic_understanding_zh_circular', 'perf_circular'],
['knowledge_perf_4_and_cloze', 'naive_average'],
['knowledge_cn', 'perf_4'],
['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
['reasonbench', 'perf_circular'],
['reasonbench_cn_circular', 'perf_circular'],
['reasonbench_en_circular', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_cn_abductive_circular', 'perf_circular'],
['reasonbench_cn_deductive_circular', 'perf_circular'],
['reasonbench_cn_inductive_circular', 'perf_circular'],
['reasonbench_en_commonsense_circular', 'perf_circular'],
['reasonbench_en_abductive_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_circular', 'perf_circular'],
['math_perf_4_and_fill_in_blank', 'naive_average'],
['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
['math_perf_4_and_fill_in_blank_en', 'naive_average'],
['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
['compassbench_v1_math-high-single_choice_en', 'perf_4'],
['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
['compassbench_v1_math-primary-cloze_en', 'accuracy'],
['code', 'naive_average'],
['code_cn', 'naive_average'],
['code_en', 'naive_average'],
['humaneval_cn_pass@1(greedy)', 'naive_average'],
['humaneval_plus_pass@1(greedy)', 'naive_average'],
['mbpp_cn_pass@1(greedy)', 'naive_average'],
['sanitized_mbpp_pass@1(greedy)', 'naive_average'],
['humanevalx', 'naive_average'],
['agent', 'naive_average'],
['agent_cn', 'naive_average'],
['agent_en', 'naive_average'],
['cibench_template_cn', 'naive_average'],
['cibench_template', 'naive_average'],
['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
['plugin_eval-mus-p10_one_review', 'naive_average'],
],
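    # collect every list defined above whose name ends with `_summary_groups`
    # (including the imported cibench / plugineval groups) into one flat list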
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
lcbench_summary_groups = [
{'name': 'lcbench', 'subsets': ['lcbench_en', 'lcbench_cn']},
]
mathbench_v1_summary_groups = [
{'name': 'mathbench-college_application', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-college-single_choice_en']},
{'name': 'mathbench-high_application', 'subsets': ['mathbench-high-single_choice_cn', 'mathbench-high-single_choice_en']},
{'name': 'mathbench-middle_application', 'subsets': ['mathbench-middle-single_choice_cn', 'mathbench-middle-single_choice_en']},
{'name': 'mathbench-primary_application', 'subsets': ['mathbench-primary-cloze_cn', 'mathbench-primary-cloze_en', 'mathbench-calculate-cloze_en'], 'weights': {'mathbench-primary-cloze_cn': 1, 'mathbench-primary-cloze_en': 1, 'mathbench-calculate-cloze_en': 2}},
{'name': 'mathbench-college_knowledge', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-college_knowledge-single_choice_en']},
{'name': 'mathbench-high_knowledge', 'subsets': ['mathbench-high_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_en']},
{'name': 'mathbench-middle_knowledge', 'subsets': ['mathbench-middle_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_en']},
{'name': 'mathbench-primary_knowledge', 'subsets': ['mathbench-primary_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_en']},
{'name': 'mathbench_application', 'subsets': ['mathbench-college_application', 'mathbench-high_application', 'mathbench-middle_application', 'mathbench-primary_application']},
{'name': 'mathbench_knowledge', 'subsets': ['mathbench-college_knowledge', 'mathbench-high_knowledge', 'mathbench-middle_knowledge', 'mathbench-primary_knowledge']},
{'name': 'mathbench', 'subsets': ['mathbench_application', 'mathbench_knowledge']},
]
@@ -71,6 +71,40 @@ _base_summary_groups = [
            ['plugin_eval-review_str_v1', 'review_quality'],
        ]
    },
# special treatment for first 10% data points
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'format_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_format_metric'],
['plugin_eval-p10-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'plugin_eval-p10',
'subsets': [
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
]
},
]

plugineval_summary_groups = []
...
summarizer = dict(
    dataset_abbrs=[
        '######## MathBench Application Accuracy ########', # category
        ['mathbench-college-single_choice_cn', 'acc_1'],
        ['mathbench-college-single_choice_en', 'acc_1'],
        ['mathbench-high-single_choice_cn', 'acc_1'],
@@ -9,15 +9,15 @@ summarizer = dict(
        ['mathbench-middle-single_choice_en', 'acc_1'],
        ['mathbench-primary-cloze_cn', 'accuracy'],
        ['mathbench-primary-cloze_en', 'accuracy'],
        ['mathbench-arithmetic-cloze_en', 'accuracy'],
        '######## MathBench Application CircularEval ########', # category
        ['mathbench-college-single_choice_cn', 'perf_4'],
        ['mathbench-college-single_choice_en', 'perf_4'],
        ['mathbench-high-single_choice_cn', 'perf_4'],
        ['mathbench-high-single_choice_en', 'perf_4'],
        ['mathbench-middle-single_choice_cn', 'perf_4'],
        ['mathbench-middle-single_choice_en', 'perf_4'],
        '######## MathBench Knowledge CircularEval ########', # category
        ['mathbench-college_knowledge-single_choice_cn', 'perf_4'],
        ['mathbench-college_knowledge-single_choice_en', 'perf_4'],
        ['mathbench-high_knowledge-single_choice_cn', 'perf_4'],
@@ -26,6 +26,15 @@ summarizer = dict(
        ['mathbench-middle_knowledge-single_choice_en', 'perf_4'],
        ['mathbench-primary_knowledge-single_choice_cn', 'perf_4'],
        ['mathbench-primary_knowledge-single_choice_en', 'perf_4'],
'######## MathBench Knowledge Accuracy ########', # category
['mathbench-college_knowledge-single_choice_cn', 'acc_1'],
['mathbench-college_knowledge-single_choice_en', 'acc_1'],
['mathbench-high_knowledge-single_choice_cn', 'acc_1'],
['mathbench-high_knowledge-single_choice_en', 'acc_1'],
['mathbench-middle_knowledge-single_choice_cn', 'acc_1'],
['mathbench-middle_knowledge-single_choice_en', 'acc_1'],
['mathbench-primary_knowledge-single_choice_cn', 'acc_1'],
['mathbench-primary_knowledge-single_choice_en', 'acc_1'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
...
import argparse
import getpass
import os
import os.path as osp
from datetime import datetime
from mmengine.config import Config, DictAction
from opencompass.partitioners import MultimodalNaivePartitioner
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import LarkReporter, get_logger
from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
fill_infer_cfg, get_config_from_arg)
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
parser.add_argument('config', nargs='?', help='Train config file path')
# add mutually exclusive args `--slurm` `--dlc`, defaults to local runner
# if "infer" or "eval" not specified
launch_method = parser.add_mutually_exclusive_group()
launch_method.add_argument('--slurm',
action='store_true',
default=False,
help='Whether to force tasks to run with srun. '
'If True, `--partition(-p)` must be set. '
'Defaults to False')
launch_method.add_argument('--dlc',
action='store_true',
default=False,
help='Whether to force tasks to run on dlc. If '
'True, `--aliyun-cfg` must be set. Defaults'
' to False')
# multi-modal support
parser.add_argument('--mm-eval',
                        help='Whether or not to enable multimodal evaluation',
action='store_true',
default=False)
# Add shortcut parameters (models, datasets and summarizer)
parser.add_argument('--models', nargs='+', help='', default=None)
parser.add_argument('--datasets', nargs='+', help='', default=None)
parser.add_argument('--summarizer', help='', default=None)
# add general args
parser.add_argument('--debug',
help='Debug mode, in which scheduler will run tasks '
'in the single process, and output will not be '
'redirected to files',
action='store_true',
default=False)
parser.add_argument('--dry-run',
help='Dry run mode, in which the scheduler will not '
'actually run the tasks, but only print the commands '
'to run',
action='store_true',
default=False)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
'only want the inference results, or "eval" if you '
'already have the results and want to evaluate them, '
'or "viz" if you want to visualize the results.',
choices=['all', 'infer', 'eval', 'viz'],
default='all',
type=str)
parser.add_argument('-r',
'--reuse',
nargs='?',
type=str,
const='latest',
help='Reuse previous outputs & results, and run any '
'missing jobs presented in the config. If its '
'argument is not specified, the latest results in '
'the work_dir will be reused. The argument should '
'also be a specific timestamp, e.g. 20230516_144254')
parser.add_argument('-w',
'--work-dir',
help='Work path, all the outputs will be '
'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc. '
'If not specified, the work_dir will be set to '
'./outputs/default.',
default=None,
type=str)
parser.add_argument(
'--config-dir',
default='configs',
help='Use the custom config directory instead of config/ to '
'search the configs for datasets, models and summarizers',
type=str)
parser.add_argument('-l',
'--lark',
help='Report the running status to lark bot',
action='store_true',
default=False)
parser.add_argument('--max-partition-size',
help='The maximum size of an infer task. Only '
'effective when "infer" is missing from the config.',
type=int,
default=40000),
parser.add_argument(
'--gen-task-coef',
help='The dataset cost measurement coefficient for generation tasks, '
'Only effective when "infer" is missing from the config.',
type=int,
default=20)
parser.add_argument('--max-num-workers',
help='Max number of workers to run in parallel. '
                        'Will be overridden by the "max_num_workers" argument '
'in the config.',
type=int,
default=32)
parser.add_argument('--max-workers-per-gpu',
help='Max task to run in parallel on one GPU. '
'It will only be used in the local runner.',
type=int,
default=1)
parser.add_argument(
'--retry',
help='Number of retries if the job failed when using slurm or dlc. '
        'Will be overridden by the "retry" argument in the config.',
type=int,
default=2)
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
# set dlc args
dlc_parser = parser.add_argument_group('dlc_args')
parse_dlc_args(dlc_parser)
# set hf args
hf_parser = parser.add_argument_group('hf_args')
parse_hf_args(hf_parser)
# set custom dataset args
custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
parse_custom_dataset_args(custom_dataset_parser)
args = parser.parse_args()
if args.slurm:
assert args.partition is not None, (
'--partition(-p) must be set if you want to use slurm')
if args.dlc:
assert os.path.exists(args.aliyun_cfg), (
'When launching tasks using dlc, it needs to be configured '
'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
' to specify a new path.')
return args
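# Example invocations (illustrative only; assumes this script is the repository's run.py
# entry point, and the config paths/names below are placeholders):
#   python run.py configs/eval_demo.py --debug
#   python run.py configs/eval_demo.py --slurm -p <partition> -r latest
#   python run.py --models <model_cfg> --datasets <dataset_cfg> -w outputs/my_run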
def parse_slurm_args(slurm_parser):
"""These args are all for slurm launch."""
slurm_parser.add_argument('-p',
'--partition',
help='Slurm partition name',
default=None,
type=str)
slurm_parser.add_argument('-q',
'--quotatype',
help='Slurm quota type',
default=None,
type=str)
slurm_parser.add_argument('--qos',
help='Slurm quality of service',
default=None,
type=str)
def parse_dlc_args(dlc_parser):
"""These args are all for dlc launch."""
dlc_parser.add_argument('--aliyun-cfg',
help='The config path for aliyun config',
default='~/.aliyun.cfg',
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-path', type=str)
hf_parser.add_argument('--peft-path', type=str)
hf_parser.add_argument('--tokenizer-path', type=str)
hf_parser.add_argument('--model-kwargs',
nargs='+',
action=DictAction,
default={})
hf_parser.add_argument('--tokenizer-kwargs',
nargs='+',
action=DictAction,
default={})
hf_parser.add_argument('--max-out-len', type=int)
hf_parser.add_argument('--max-seq-len', type=int)
hf_parser.add_argument('--no-batch-padding',
action='store_true',
default=False)
hf_parser.add_argument('--batch-size', type=int)
hf_parser.add_argument('--num-gpus', type=int)
hf_parser.add_argument('--pad-token-id', type=int)
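    # `DictAction` accepts key=value pairs on the command line, e.g.:
    #   --model-kwargs device_map='auto' trust_remote_code=True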
def parse_custom_dataset_args(custom_dataset_parser):
"""These args are all for the quick construction of custom datasets."""
custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-data-type',
type=str,
choices=['mcq', 'qa'])
custom_dataset_parser.add_argument('--custom-dataset-infer-method',
type=str,
choices=['gen', 'ppl'])
def main():
args = parse_args()
if args.dry_run:
args.debug = True
# initialize logger
logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')
cfg = get_config_from_arg(args)
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', './outputs/default/')
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
if args.reuse:
if args.reuse == 'latest':
if not os.path.exists(cfg.work_dir) or not os.listdir(
cfg.work_dir):
logger.warning('No previous results to reuse!')
else:
dirs = os.listdir(cfg.work_dir)
dir_time_str = sorted(dirs)[-1]
else:
dir_time_str = args.reuse
        logger.info(f'Reusing experiments from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)
# dump config
output_config_path = osp.join(cfg.work_dir, 'configs',
f'{cfg_time_str}.py')
cfg.dump(output_config_path)
    # Config is intentionally reloaded here to avoid the issue that
    # initialized types cannot be serialized
cfg = Config.fromfile(output_config_path, format_python_code=False)
# report to lark bot if specify --lark
if not args.lark:
cfg['lark_bot_url'] = None
elif cfg.get('lark_bot_url', None):
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
if args.mode in ['all', 'infer']:
        # When the user has specified --slurm or --dlc, or has not set
        # "infer" in the config, we provide a default configuration
        # for infer
if (args.dlc or args.slurm) and cfg.get('infer', None):
logger.warning('You have set "infer" in the config, but '
'also specified --slurm or --dlc. '
'The "infer" configuration will be overridden by '
'your runtime arguments.')
# Check whether run multimodal evaluation
if args.mm_eval:
partitioner = MultimodalNaivePartitioner(
osp.join(cfg['work_dir'], 'predictions/'))
tasks = partitioner(cfg)
exec_mm_infer_runner(tasks, args, cfg)
return
if args.dlc or args.slurm or cfg.get('infer', None) is None:
fill_infer_cfg(cfg, args)
if args.partition is not None:
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
cfg.infer.runner.partition = args.partition
cfg.infer.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.infer.runner.debug = True
if args.lark:
cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
'predictions/')
partitioner = PARTITIONERS.build(cfg.infer.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.infer.runner)
# Add extra attack config if exists
if hasattr(cfg, 'attack'):
for task in tasks:
cfg.attack.dataset = task.datasets[0][0].abbr
task.attack = cfg.attack
runner(tasks)
# evaluate
if args.mode in ['all', 'eval']:
        # When the user has specified --slurm or --dlc, or has not set
        # "eval" in the config, we provide a default configuration
        # for eval
if (args.dlc or args.slurm) and cfg.get('eval', None):
logger.warning('You have set "eval" in the config, but '
'also specified --slurm or --dlc. '
'The "eval" configuration will be overridden by '
'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
cfg.eval.runner.task.dump_details = True
if args.partition is not None:
if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
cfg.eval.runner.partition = args.partition
cfg.eval.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.eval.runner.debug = True
if args.lark:
cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.eval.runner)
# For meta-review-judge in subjective evaluation
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
for task_part in tasks:
runner(task_part)
else:
runner(tasks)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
summarizer_cfg['type'] = DefaultSummarizer
summarizer_cfg['config'] = cfg
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()
import ast

try:
    import networkx as nx
except ImportError:
    nx = None
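# networkx is optional here; `nx` is left as None when the package is missing,
# so this module can still be imported without it.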
from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
...
import ast
import json

try:
    import networkx as nx
except ImportError:
    nx = None

import pandas as pd
from datasets import Dataset
...
import ast
import json

try:
    import networkx as nx
except ImportError:
    nx = None

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
...
@@ -3,6 +3,7 @@ from .afqmcd import *  # noqa: F401, F403
from .agieval import *  # noqa: F401, F403
from .anli import AnliDataset  # noqa: F401, F403
from .anthropics_evals import *  # noqa: F401, F403
from .apps import * # noqa: F401, F403
from .arc import *  # noqa: F401, F403
from .ax import *  # noqa: F401, F403
from .bbh import *  # noqa: F401, F403
@@ -94,6 +95,7 @@ from .summedits import *  # noqa: F401, F403
from .summscreen import *  # noqa: F401, F403
from .svamp import *  # noqa: F401, F403
from .tabmwp import *  # noqa: F401, F403
from .taco import * # noqa: F401, F403
from .teval import *  # noqa: F401, F403
from .TheoremQA import *  # noqa: F401, F403
from .tnews import *  # noqa: F401, F403
...
@@ -19,13 +19,19 @@ from unittest.mock import mock_open, patch

import numpy as np
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

try:
    from pyext import RuntimeModule
except ImportError:
    RuntimeModule = None

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger

from .base import BaseDataset

logger = get_logger()

TIMEOUT = 10
@@ -67,18 +73,20 @@ class APPSDataset(BaseDataset):
            new_dataset[split] = Dataset.from_dict(new_data)

        # num_repeats duplicate
        # train_repeated = []
        test_repeated = []
        # for sample in new_dataset['train']:
        #     train_repeated.extend([sample] * num_repeats)
        for sample in new_dataset['test']:
            test_repeated.extend([sample] * num_repeats)
        # dataset_train_repeated = new_dataset['train'].from_list(
        #     train_repeated
        # )
        dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
        return DatasetDict({
            # 'train': dataset_train_repeated,
            'test': dataset_test_repeated
        })
@@ -121,18 +129,20 @@ class APPS_miniDataset(BaseDataset):
            new_dataset[split] = Dataset.from_dict(new_data)

        # num_repeats duplicate
        # train_repeated = []
        test_repeated = []
        # for sample in new_dataset['train']:
        #     train_repeated.extend([sample] * num_repeats)
        for sample in new_dataset['test']:
            test_repeated.extend([sample] * num_repeats)
        # dataset_train_repeated = new_dataset['train'].from_list(
        #     train_repeated
        # )
        dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
        return DatasetDict({
            # 'train': dataset_train_repeated,
            'test': dataset_test_repeated
        })
@@ -308,7 +318,10 @@ def timeout_handler(signum, frame):
    raise TimeoutException
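# signal.SIGALRM does not exist on every platform (e.g. Windows), so the handler
# registration is guarded instead of being allowed to raise.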
try:
    signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
    logger.warning('signal.SIGALRM is not available on this platform')

timeout = 4  # seconds
...
@@ -210,6 +210,8 @@ def make_mcq_gen_config(meta):
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
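    # Forward an optional `test_range` (a slice expression such as '[0:100]') from the
    # dataset meta to the reader config so that only part of the split is evaluated.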
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
@@ -255,6 +257,8 @@ def make_circular_mcq_gen_config(meta):
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
@@ -304,6 +308,8 @@ def make_qa_gen_config(meta):
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
@@ -353,6 +359,8 @@ def make_mcq_ppl_config(meta):
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
@@ -399,6 +407,8 @@ def make_circular_mcq_ppl_config(meta):
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
...
@@ -168,9 +168,12 @@ def _clean_up_code(text: str, language_type: str, reference) -> str:
    """Cleans up the generated code."""
    try:
        # for chatGLM related text
        eval_text = eval(text)
    except Exception:
        pass
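    # only keep the eval()'d value when it is a string, so that a non-string
    # result cannot overwrite the original completion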
else:
if isinstance(eval_text, str):
text = eval_text
    # extract code from code block
    text = text.lstrip('\n')
    if '```' in text:
...
import re
-import cn2an

"""
task: law article prediction
@@ -15,6 +14,7 @@ def compute_ljp_article(data_dict):
    A reference contains a list of articles of the Criminal Law of the People's Republic of China.
    We compute the F1-score between the prediction and the reference.
    """
import cn2an
    score_list, abstentions = [], 0
...
import math
-import cn2an
import re

# 法律判决预测-刑期预测 (legal judgment prediction: prison-term prediction)
def compute_ljp_imprison(data_dict):
import cn2an
    score_list, abstentions = [], 0
    for example in data_dict:
...
@@ -85,6 +85,7 @@ def normalize_final_answer(final_answer: str) -> str:
    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
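    # e.g. r'\text{(A)}' -> 'A': unwrap answers that are parenthesised inside \text{}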
final_answer = re.sub(r'(\\text\{)\((.*?)\)(\})', '\\2', final_answer)
    final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
    final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
    final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
@@ -178,10 +179,7 @@ class MATHEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'preds and refrs have different length'}
        correct = 0
        count = 0
        details = []
@@ -457,8 +455,23 @@ class MATHEvaluator(BaseEvaluator):
            ss2 = strip_string_func(str2)
            if verbose:
                print(ss1, ss2)
            if ss1 == ss2:
                return True
            ss1 = normalize_final_answer(ss1)
            ss2 = normalize_final_answer(ss2)
            if ss1 == ss2:
                return True
        except Exception:
            pass

        try:
            ss1 = normalize_final_answer(str1)
            ss2 = normalize_final_answer(str2)
            if ss1 == ss2:
                return True
        except Exception:
            pass

        return str1 == str2
...
@@ -57,7 +57,7 @@ class MathBenchDataset(BaseDataset):
        """
        data = []
        filename = osp.join(path, f'{name}.jsonl')
        with open(filename, 'r', encoding='utf-8') as infile:
            for id, line in enumerate(infile):
                entry = json.loads(line)
                if 'cloze' in name:
...
@@ -244,6 +244,7 @@ class MBPPEvaluator(BaseEvaluator):
            if not isinstance(preds, list):
                preds = [preds]
            for pred in preds:
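                # run the evaluator's answer post-processing on the raw completion
                # before it is written out for execution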
pred = self._process_answer(pred)
                mbpp_preds.append({'task_id': refer, 'solution': pred})
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_dir = osp.join(tmp_dir, 'mbpp_eval.jsonl')
...
@@ -18,14 +18,20 @@ from io import StringIO
from unittest.mock import mock_open, patch

import numpy as np
from datasets import Dataset, DatasetDict, load_from_disk

try:
    from pyext import RuntimeModule
except ImportError:
    RuntimeModule = None

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger

from .base import BaseDataset

logger = get_logger()

TIMEOUT = 10
@@ -34,7 +40,7 @@ class TACODataset(BaseDataset):

    @staticmethod
    def load(path: str, num_repeats: int = 1):
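        # the TACO data is now read from a local dataset directory previously saved
        # with `datasets.save_to_disk`, instead of being fetched via `load_dataset`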
        dataset = load_from_disk(path)
        new_dataset = DatasetDict()
        # add new column "starter" in the prompt
        for split in dataset.keys():
@@ -69,18 +75,20 @@ class TACODataset(BaseDataset):
            new_dataset[split] = Dataset.from_dict(new_data)

        # num_repeats duplicate
        # train_repeated = []
        test_repeated = []
        # for sample in new_dataset['train']:
        #     train_repeated.extend([sample] * num_repeats)
        for sample in new_dataset['test']:
            test_repeated.extend([sample] * num_repeats)
        # dataset_train_repeated = new_dataset['train'].from_list(
        #     train_repeated
        # )
        dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
        return DatasetDict({
            # 'train': dataset_train_repeated,
            'test': dataset_test_repeated
        })
@@ -256,7 +264,10 @@ def timeout_handler(signum, frame):
    raise TimeoutException

try:
    signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
    logger.warning('signal.SIGALRM is not available on this platform')

timeout = 4  # seconds
...