Unverified commit b39f5015 authored by Fengzhe Zhou, committed by GitHub

[Sync] update taco (#1030)

parent 16f29b25
......@@ -12,7 +12,6 @@ compassbench_v1_knowledge_groups = [
{'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
'compassbench_v1_knowledge-mixed-cloze_en'
summarizer = dict(
dataset_abbrs=[
'knowledge_perf_4_and_cloze',
......
from mmengine.config import read_base
with read_base():
from .groups.cibench import cibench_summary_groups
from .groups.plugineval import plugineval_summary_groups
compassbench_v1_language_names = [
# ['information_retrieval_en', 'score'],
# ['information_retrieval_zh', 'score'],
['intention_recognition_en_circular', 'acc_origin'],
['intention_recognition_en_circular', 'perf_circular'],
['intention_recognition_zh_circular', 'acc_origin'],
['intention_recognition_zh_circular', 'perf_circular'],
['sentiment_analysis_en_circular', 'acc_origin'],
['sentiment_analysis_en_circular', 'perf_circular'],
['sentiment_analysis_zh_circular', 'acc_origin'],
['sentiment_analysis_zh_circular', 'perf_circular'],
['translation', 'score'],
['content_critic_en_circular', 'acc_origin'],
['content_critic_en_circular', 'perf_circular'],
['content_critic_zh_circular', 'acc_origin'],
['content_critic_zh_circular', 'perf_circular'],
['content_summarization_en', 'rouge1'],
['content_summarization_zh', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'acc_origin'],
['traditional_cultural_understanding_zh_circular', 'perf_circular'],
['chinese_semantic_understanding_zh_circular', 'acc_origin'],
['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]
compassbench_v1_language_summary_groups = [
{'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']},
{'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']},
{'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
{'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']},
{'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']},
{'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]
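Each group above is later collapsed into a single number by the summarizer. A minimal sketch of an unweighted reduction, with a hypothetical group_average helper and a hypothetical scores table keyed by (dataset_abbr, metric), not the actual DefaultSummarizer code:

def group_average(group, scores):
    # scores: {(dataset_abbr, metric): float} -- hypothetical lookup table
    values = []
    for subset in group['subsets']:
        if isinstance(subset, (list, tuple)):
            dataset, metric = subset
        else:
            dataset, metric = subset, 'naive_average'  # bare-string subsets fall back to a default metric
        values.append(scores[(dataset, metric)])
    return sum(values) / len(values)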
# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
compassbench_v1_knowledge_names = [
'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]
compassbench_v1_knowledge_summary_groups = [
{'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
{'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
{'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
compassbench_v1_reason_summary_groups = [
{'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']},
{'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']},
{'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']},
{'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
{'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
{'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']},
{'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']},
{'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']},
]
compassbench_v1_math_summary_groups = [
{'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
{'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
{'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]},
{'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
]
code_passk_summary_groups = [
# rename
{'name': 'humaneval_pass@1(greedy)', 'subsets': [['openai_humaneval', 'humaneval_pass@1']]},
{'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_passk', 'humaneval_pass@10']]},
{'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_repeat10', 'humaneval_pass@10']]},
{'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]},
{'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_passk', 'humaneval_pass@10']]},
{'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_repeat10', 'humaneval_pass@10']]},
{'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]},
{'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_passk', 'humaneval_plus_pass@10']]},
{'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_repeat10', 'humaneval_plus_pass@10']]},
{'name': 'mbpp_pass@1(greedy)', 'subsets': [['mbpp', 'score']]},
{'name': 'mbpp_pass@10', 'subsets': [['mbpp_passk', 'pass@10']]},
{'name': 'mbpp_pass@10', 'subsets': [['mbpp_repeat10', 'pass@10']]},
{'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]},
{'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_passk', 'pass@10']]},
{'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_repeat10', 'pass@10']]},
{'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]},
{'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_passk', 'pass@10']]},
{'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
# real add
{'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
{'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']},
{'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
{'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
]
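The entries under `# rename` each wrap a single (dataset, metric) pair, so their "group average" is just that metric surfaced under a new display name, while the `# real add` entries build genuinely aggregated scores on top of them. A quick sketch reusing the hypothetical group_average helper above, with a made-up score:

scores = {('openai_humaneval', 'humaneval_pass@1'): 75.0}  # made-up value
rename_group = {'name': 'humaneval_pass@1(greedy)',
                'subsets': [['openai_humaneval', 'humaneval_pass@1']]}
assert group_average(rename_group, scores) == 75.0  # a single-subset group is a pure rename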
agent_summary_groups = [
# dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']),
# dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']),
dict(name='cibench_template', subsets=['cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim']),
dict(name='cibench_template_cn', subsets=['cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim']),
dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']),
dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']),
dict(name='agent', subsets=['agent_cn', 'agent_en']),
]
other_summary_groups = [
{
"name": "average_cn",
"subsets": [
["language_zh_perf_4_and_non_mcq", "naive_average"],
["knowledge_cn", "perf_4"],
["reasonbench_cn_circular", "perf_circular"],
["math_perf_4_and_fill_in_blank_cn", "naive_average"],
["code_cn", "naive_average"],
["agent_cn", "naive_average"],
],
},
{
"name": "average_en",
"subsets": [
["language_en_perf_4_and_non_mcq", "naive_average"],
["compassbench_v1_knowledge-mixed-cloze_en", "score"],
["reasonbench_en_circular", "perf_circular"],
["math_perf_4_and_fill_in_blank_en", "naive_average"],
["code_en", "naive_average"],
["agent_en", "naive_average"],
],
},
{
"name": "average",
"subsets": [
["language_perf_4_and_non_mcq", "naive_average"],
["knowledge_perf_4_and_cloze", "naive_average"],
["reasonbench", "perf_circular"],
["math_perf_4_and_fill_in_blank", "naive_average"],
["code", "naive_average"],
["agent", "naive_average"],
],
},
]
summarizer = dict(
dataset_abbrs=[
['average', 'naive_average'],
['average_cn', 'naive_average'],
['average_en', 'naive_average'],
'',
'',
'',
['language_perf_4_and_non_mcq', 'naive_average'],
['language_zh_perf_4_and_non_mcq', 'naive_average'],
['language_en_perf_4_and_non_mcq', 'naive_average'],
['intention_recognition_zh_circular', 'perf_circular'],
['intention_recognition_en_circular', 'perf_circular'],
['sentiment_analysis_zh_circular', 'perf_circular'],
['sentiment_analysis_en_circular', 'perf_circular'],
['translation', 'score'],
['content_critic_zh_circular', 'perf_circular'],
['content_critic_en_circular', 'perf_circular'],
['content_summarization_zh', 'rouge1'],
['content_summarization_en', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'perf_circular'],
['chinese_semantic_understanding_zh_circular', 'perf_circular'],
['knowledge_perf_4_and_cloze', 'naive_average'],
['knowledge_cn', 'perf_4'],
['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
['reasonbench', 'perf_circular'],
['reasonbench_cn_circular', 'perf_circular'],
['reasonbench_en_circular', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_cn_abductive_circular', 'perf_circular'],
['reasonbench_cn_deductive_circular', 'perf_circular'],
['reasonbench_cn_inductive_circular', 'perf_circular'],
['reasonbench_en_commonsense_circular', 'perf_circular'],
['reasonbench_en_abductive_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_circular', 'perf_circular'],
['math_perf_4_and_fill_in_blank', 'naive_average'],
['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
['math_perf_4_and_fill_in_blank_en', 'naive_average'],
['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
['compassbench_v1_math-high-single_choice_en', 'perf_4'],
['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
['compassbench_v1_math-primary-cloze_en', 'accuracy'],
['code', 'naive_average'],
['code_cn', 'naive_average'],
['code_en', 'naive_average'],
['humaneval_cn_pass@1(greedy)', 'naive_average'],
['humaneval_plus_pass@1(greedy)', 'naive_average'],
['mbpp_cn_pass@1(greedy)', 'naive_average'],
['sanitized_mbpp_pass@1(greedy)', 'naive_average'],
['humanevalx', 'naive_average'],
['agent', 'naive_average'],
['agent_cn', 'naive_average'],
['agent_en', 'naive_average'],
['cibench_template_cn', 'naive_average'],
['cibench_template', 'naive_average'],
['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
['plugin_eval-mus-p10_one_review', 'naive_average'],
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
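The summary_groups line merges every list defined in this module whose name ends in `_summary_groups` into one flat list. A self-contained sketch of the same idiom:

code_summary_groups = [{'name': 'code'}]
agent_summary_groups = [{'name': 'agent'}]
merged = sum([v for k, v in locals().items() if k.endswith('_summary_groups')], [])
# merged == [{'name': 'code'}, {'name': 'agent'}]; sum(..., []) concatenates the lists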
lcbench_summary_groups = [
{'name': 'lcbench', 'subsets': ['lcbench_en', 'lcbench_cn']},
]
mathbench_v1_summary_groups = [
{'name': 'mathbench-college_application', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-college-single_choice_en']},
{'name': 'mathbench-high_application', 'subsets': ['mathbench-high-single_choice_cn', 'mathbench-high-single_choice_en']},
{'name': 'mathbench-middle_application', 'subsets': ['mathbench-middle-single_choice_cn', 'mathbench-middle-single_choice_en']},
{'name': 'mathbench-primary_application', 'subsets': ['mathbench-primary-cloze_cn', 'mathbench-primary-cloze_en', 'mathbench-calculate-cloze_en'], 'weights': {'mathbench-primary-cloze_cn': 1, 'mathbench-primary-cloze_en': 1, 'mathbench-calculate-cloze_en': 2}},
{'name': 'mathbench-college_knowledge', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-college_knowledge-single_choice_en']},
{'name': 'mathbench-high_knowledge', 'subsets': ['mathbench-high_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_en']},
{'name': 'mathbench-middle_knowledge', 'subsets': ['mathbench-middle_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_en']},
{'name': 'mathbench-primary_knowledge', 'subsets': ['mathbench-primary_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_en']},
{'name': 'mathbench_application', 'subsets': ['mathbench-college_application', 'mathbench-high_application', 'mathbench-middle_application', 'mathbench-primary_application']},
{'name': 'mathbench_knowledge', 'subsets': ['mathbench-college_knowledge', 'mathbench-high_knowledge', 'mathbench-middle_knowledge', 'mathbench-primary_knowledge']},
{'name': 'mathbench', 'subsets': ['mathbench_application', 'mathbench_knowledge']},
]
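`mathbench-primary_application` is the only group here that carries a `weights` dict. A minimal sketch of a weighted reduction, an assumption about how such weights could be consumed rather than the summarizer's actual implementation:

def weighted_group_average(group, scores):
    # scores: {dataset_abbr: float} -- hypothetical per-dataset scores
    weights = group.get('weights', {})
    total = sum(scores[name] * weights.get(name, 1) for name in group['subsets'])
    norm = sum(weights.get(name, 1) for name in group['subsets'])
    return total / norm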
......@@ -71,6 +71,40 @@ _base_summary_groups = [
['plugin_eval-review_str_v1', 'review_quality'],
]
},
# special treatment for the first 10% of data points
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'format_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_format_metric'],
['plugin_eval-p10-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'plugin_eval-p10',
'subsets': [
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
]
},
]
plugineval_summary_groups = []
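Two of the added groups reuse the dataset's own abbreviation ('plugin_eval-p10-instruct_v1') but set an explicit 'metric', which attaches a derived metric to that dataset rather than adding a new row; 'plugin_eval-p10' then averages those derived metrics together with its other subsets. A runnable sketch of the presumed derivation, assuming an unweighted mean and made-up values:

string_format_metric, json_format_metric = 80.0, 70.0
string_args_em_metric, json_args_em_metric = 60.0, 50.0
format_metric = (string_format_metric + json_format_metric) / 2      # 75.0
args_em_metric = (string_args_em_metric + json_args_em_metric) / 2   # 55.0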
......
summarizer = dict(
dataset_abbrs=[
'######## MathBench Accuracy ########', # category
'######## MathBench Application Accuracy ########', # category
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-single_choice_en', 'acc_1'],
['mathbench-high-single_choice_cn', 'acc_1'],
......@@ -9,15 +9,15 @@ summarizer = dict(
['mathbench-middle-single_choice_en', 'acc_1'],
['mathbench-primary-cloze_cn', 'accuracy'],
['mathbench-primary-cloze_en', 'accuracy'],
['mathbench-calculate-cloze_en', 'accuracy'],
'######## MathBench CircularEval ########', # category
['mathbench-arithmetic-cloze_en', 'accuracy'],
'######## MathBench Application CircularEval ########', # category
['mathbench-college-single_choice_cn', 'perf_4'],
['mathbench-college-single_choice_en', 'perf_4'],
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
['mathbench-middle-single_choice_cn', 'perf_4'],
['mathbench-middle-single_choice_en', 'perf_4'],
'######## MathBench Knowledge ########', # category
'######## MathBench Knowledge CircularEval ########', # category
['mathbench-college_knowledge-single_choice_cn', 'perf_4'],
['mathbench-college_knowledge-single_choice_en', 'perf_4'],
['mathbench-high_knowledge-single_choice_cn', 'perf_4'],
......@@ -26,6 +26,15 @@ summarizer = dict(
['mathbench-middle_knowledge-single_choice_en', 'perf_4'],
['mathbench-primary_knowledge-single_choice_cn', 'perf_4'],
['mathbench-primary_knowledge-single_choice_en', 'perf_4'],
'######## MathBench Knowledge Accuracy ########', # category
['mathbench-college_knowledge-single_choice_cn', 'acc_1'],
['mathbench-college_knowledge-single_choice_en', 'acc_1'],
['mathbench-high_knowledge-single_choice_cn', 'acc_1'],
['mathbench-high_knowledge-single_choice_en', 'acc_1'],
['mathbench-middle_knowledge-single_choice_cn', 'acc_1'],
['mathbench-middle_knowledge-single_choice_en', 'acc_1'],
['mathbench-primary_knowledge-single_choice_cn', 'acc_1'],
['mathbench-primary_knowledge-single_choice_en', 'acc_1'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
......
import argparse
import getpass
import os
import os.path as osp
from datetime import datetime
from mmengine.config import Config, DictAction
from opencompass.partitioners import MultimodalNaivePartitioner
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import LarkReporter, get_logger
from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
fill_infer_cfg, get_config_from_arg)
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
parser.add_argument('config', nargs='?', help='Train config file path')
# add mutually exclusive args `--slurm` `--dlc`, defaults to local runner
# if "infer" or "eval" not specified
launch_method = parser.add_mutually_exclusive_group()
launch_method.add_argument('--slurm',
action='store_true',
default=False,
help='Whether to force tasks to run with srun. '
'If True, `--partition(-p)` must be set. '
'Defaults to False')
launch_method.add_argument('--dlc',
action='store_true',
default=False,
help='Whether to force tasks to run on dlc. If '
'True, `--aliyun-cfg` must be set. Defaults'
' to False')
# multi-modal support
parser.add_argument('--mm-eval',
help='Whether or not to enable multimodal evaluation',
action='store_true',
default=False)
# Add shortcut parameters (models, datasets and summarizer)
parser.add_argument('--models', nargs='+', help='', default=None)
parser.add_argument('--datasets', nargs='+', help='', default=None)
parser.add_argument('--summarizer', help='', default=None)
# add general args
parser.add_argument('--debug',
help='Debug mode, in which the scheduler will run tasks '
'in a single process, and output will not be '
'redirected to files',
action='store_true',
default=False)
parser.add_argument('--dry-run',
help='Dry run mode, in which the scheduler will not '
'actually run the tasks, but only print the commands '
'to run',
action='store_true',
default=False)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
'only want the inference results, or "eval" if you '
'already have the results and want to evaluate them, '
'or "viz" if you want to visualize the results.',
choices=['all', 'infer', 'eval', 'viz'],
default='all',
type=str)
parser.add_argument('-r',
'--reuse',
nargs='?',
type=str,
const='latest',
help='Reuse previous outputs & results, and run any '
'missing jobs presented in the config. If its '
'argument is not specified, the latest results in '
'the work_dir will be reused. The argument can '
'also be a specific timestamp, e.g. 20230516_144254')
parser.add_argument('-w',
'--work-dir',
help='Work path, all the outputs will be '
'saved in this path, including the slurm logs, '
'the evaluation results, the summary results, etc.'
'If not specified, the work_dir will be set to '
'./outputs/default.',
default=None,
type=str)
parser.add_argument(
'--config-dir',
default='configs',
help='Use the custom config directory instead of configs/ to '
'search the configs for datasets, models and summarizers',
type=str)
parser.add_argument('-l',
'--lark',
help='Report the running status to lark bot',
action='store_true',
default=False)
parser.add_argument('--max-partition-size',
help='The maximum size of an infer task. Only '
'effective when "infer" is missing from the config.',
type=int,
default=40000)
parser.add_argument(
'--gen-task-coef',
help='The dataset cost measurement coefficient for generation tasks. '
'Only effective when "infer" is missing from the config.',
type=int,
default=20)
parser.add_argument('--max-num-workers',
help='Max number of workers to run in parallel. '
'Will be overrideen by the "max_num_workers" argument '
'in the config.',
type=int,
default=32)
parser.add_argument('--max-workers-per-gpu',
help='Max tasks to run in parallel on one GPU. '
'It will only be used in the local runner.',
type=int,
default=1)
parser.add_argument(
'--retry',
help='Number of retries if the job failed when using slurm or dlc. '
'Will be overrideen by the "retry" argument in the config.',
type=int,
default=2)
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
# set dlc args
dlc_parser = parser.add_argument_group('dlc_args')
parse_dlc_args(dlc_parser)
# set hf args
hf_parser = parser.add_argument_group('hf_args')
parse_hf_args(hf_parser)
# set custom dataset args
custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
parse_custom_dataset_args(custom_dataset_parser)
args = parser.parse_args()
if args.slurm:
assert args.partition is not None, (
'--partition(-p) must be set if you want to use slurm')
if args.dlc:
assert os.path.exists(args.aliyun_cfg), (
'When launching tasks using dlc, it needs to be configured '
'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
' to specify a new path.')
return args
def parse_slurm_args(slurm_parser):
"""These args are all for slurm launch."""
slurm_parser.add_argument('-p',
'--partition',
help='Slurm partition name',
default=None,
type=str)
slurm_parser.add_argument('-q',
'--quotatype',
help='Slurm quota type',
default=None,
type=str)
slurm_parser.add_argument('--qos',
help='Slurm quality of service',
default=None,
type=str)
def parse_dlc_args(dlc_parser):
"""These args are all for dlc launch."""
dlc_parser.add_argument('--aliyun-cfg',
help='The config path for aliyun config',
default='~/.aliyun.cfg',
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-path', type=str)
hf_parser.add_argument('--peft-path', type=str)
hf_parser.add_argument('--tokenizer-path', type=str)
hf_parser.add_argument('--model-kwargs',
nargs='+',
action=DictAction,
default={})
hf_parser.add_argument('--tokenizer-kwargs',
nargs='+',
action=DictAction,
default={})
hf_parser.add_argument('--max-out-len', type=int)
hf_parser.add_argument('--max-seq-len', type=int)
hf_parser.add_argument('--no-batch-padding',
action='store_true',
default=False)
hf_parser.add_argument('--batch-size', type=int)
hf_parser.add_argument('--num-gpus', type=int)
hf_parser.add_argument('--pad-token-id', type=int)
def parse_custom_dataset_args(custom_dataset_parser):
"""These args are all for the quick construction of custom datasets."""
custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-data-type',
type=str,
choices=['mcq', 'qa'])
custom_dataset_parser.add_argument('--custom-dataset-infer-method',
type=str,
choices=['gen', 'ppl'])
def main():
args = parse_args()
if args.dry_run:
args.debug = True
# initialize logger
logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')
cfg = get_config_from_arg(args)
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', './outputs/default/')
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
if args.reuse:
if args.reuse == 'latest':
if not os.path.exists(cfg.work_dir) or not os.listdir(
cfg.work_dir):
logger.warning('No previous results to reuse!')
else:
dirs = os.listdir(cfg.work_dir)
dir_time_str = sorted(dirs)[-1]
else:
dir_time_str = args.reuse
logger.info(f'Reusing experiments from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)
# dump config
output_config_path = osp.join(cfg.work_dir, 'configs',
f'{cfg_time_str}.py')
cfg.dump(output_config_path)
# Config is intentionally reloaded from disk here because
# already-initialized types cannot be serialized
cfg = Config.fromfile(output_config_path, format_python_code=False)
# report to lark bot if specify --lark
if not args.lark:
cfg['lark_bot_url'] = None
elif cfg.get('lark_bot_url', None):
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
if args.mode in ['all', 'infer']:
# When the user has specified --slurm or --dlc, or has not set
# "infer" in the config, we will provide a default configuration
# for infer
if (args.dlc or args.slurm) and cfg.get('infer', None):
logger.warning('You have set "infer" in the config, but '
'also specified --slurm or --dlc. '
'The "infer" configuration will be overridden by '
'your runtime arguments.')
# Check whether to run multimodal evaluation
if args.mm_eval:
partitioner = MultimodalNaivePartitioner(
osp.join(cfg['work_dir'], 'predictions/'))
tasks = partitioner(cfg)
exec_mm_infer_runner(tasks, args, cfg)
return
if args.dlc or args.slurm or cfg.get('infer', None) is None:
fill_infer_cfg(cfg, args)
if args.partition is not None:
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
cfg.infer.runner.partition = args.partition
cfg.infer.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.infer.runner.debug = True
if args.lark:
cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
'predictions/')
partitioner = PARTITIONERS.build(cfg.infer.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.infer.runner)
# Add extra attack config if it exists
if hasattr(cfg, 'attack'):
for task in tasks:
cfg.attack.dataset = task.datasets[0][0].abbr
task.attack = cfg.attack
runner(tasks)
# evaluate
if args.mode in ['all', 'eval']:
# When the user has specified --slurm or --dlc, or has not set
# "eval" in the config, we will provide a default configuration
# for eval
if (args.dlc or args.slurm) and cfg.get('eval', None):
logger.warning('You have set "eval" in the config, but '
'also specified --slurm or --dlc. '
'The "eval" configuration will be overridden by '
'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
cfg.eval.runner.task.dump_details = True
if args.partition is not None:
if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
cfg.eval.runner.partition = args.partition
cfg.eval.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.eval.runner.debug = True
if args.lark:
cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.eval.runner)
# For meta-review-judge in subjective evaluation
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
for task_part in tasks:
runner(task_part)
else:
runner(tasks)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
summarizer_cfg['type'] = DefaultSummarizer
summarizer_cfg['config'] = cfg
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()
import ast
import networkx as nx
try:
import networkx as nx
except ImportError:
nx = None
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
......
import ast
import json
import networkx as nx
try:
import networkx as nx
except ImportError:
nx = None
import pandas as pd
from datasets import Dataset
......
import ast
import json
import networkx as nx
try:
import networkx as nx
except ImportError:
nx = None
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
......
......@@ -3,6 +3,7 @@ from .afqmcd import * # noqa: F401, F403
from .agieval import * # noqa: F401, F403
from .anli import AnliDataset # noqa: F401, F403
from .anthropics_evals import * # noqa: F401, F403
from .apps import * # noqa: F401, F403
from .arc import * # noqa: F401, F403
from .ax import * # noqa: F401, F403
from .bbh import * # noqa: F401, F403
......@@ -94,6 +95,7 @@ from .summedits import * # noqa: F401, F403
from .summscreen import * # noqa: F401, F403
from .svamp import * # noqa: F401, F403
from .tabmwp import * # noqa: F401, F403
from .taco import * # noqa: F401, F403
from .teval import * # noqa: F401, F403
from .TheoremQA import * # noqa: F401, F403
from .tnews import * # noqa: F401, F403
......
......@@ -19,13 +19,19 @@ from unittest.mock import mock_open, patch
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from pyext import RuntimeModule
try:
from pyext import RuntimeModule
except ImportError:
RuntimeModule = None
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger
from .base import BaseDataset
logger = get_logger()
TIMEOUT = 10
......@@ -67,18 +73,20 @@ class APPSDataset(BaseDataset):
new_dataset[split] = Dataset.from_dict(new_data)
# num_repeats duplicate
train_repeated = []
# train_repeated = []
test_repeated = []
for sample in new_dataset['train']:
train_repeated.extend([sample] * num_repeats)
# for sample in new_dataset['train']:
# train_repeated.extend([sample] * num_repeats)
for sample in new_dataset['test']:
test_repeated.extend([sample] * num_repeats)
dataset_train_repeated = new_dataset['train'].from_list(train_repeated)
# dataset_train_repeated = new_dataset['train'].from_list(
# train_repeated
# )
dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
return DatasetDict({
'train': dataset_train_repeated,
# 'train': dataset_train_repeated,
'test': dataset_test_repeated
})
......@@ -121,18 +129,20 @@ class APPS_miniDataset(BaseDataset):
new_dataset[split] = Dataset.from_dict(new_data)
# num_repeats duplicate
train_repeated = []
# train_repeated = []
test_repeated = []
for sample in new_dataset['train']:
train_repeated.extend([sample] * num_repeats)
# for sample in new_dataset['train']:
# train_repeated.extend([sample] * num_repeats)
for sample in new_dataset['test']:
test_repeated.extend([sample] * num_repeats)
dataset_train_repeated = new_dataset['train'].from_list(train_repeated)
# dataset_train_repeated = new_dataset['train'].from_list(
# train_repeated
# )
dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
return DatasetDict({
'train': dataset_train_repeated,
# 'train': dataset_train_repeated,
'test': dataset_test_repeated
})
......@@ -308,7 +318,10 @@ def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
try:
signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
logger.warning('signal.SIGALRM is not available on this platform')
timeout = 4 # seconds
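signal.SIGALRM exists only on POSIX, which is why its registration above is now wrapped in try/except. On platforms without it, a thread-based wait is one possible fallback; a minimal sketch (not what this evaluator does, and note the worker is abandoned rather than killed on timeout):

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout

def run_with_timeout(fn, timeout_s, *args, **kwargs):
    # Run fn in a worker thread and stop waiting after timeout_s seconds.
    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(fn, *args, **kwargs)
    try:
        return future.result(timeout=timeout_s)
    except FutureTimeout:
        return None  # caller would treat None as a timed-out test case
    finally:
        pool.shutdown(wait=False)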
......
......@@ -210,6 +210,8 @@ def make_mcq_gen_config(meta):
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
......@@ -255,6 +257,8 @@ def make_circular_mcq_gen_config(meta):
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
......@@ -304,6 +308,8 @@ def make_qa_gen_config(meta):
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
......@@ -353,6 +359,8 @@ def make_mcq_ppl_config(meta):
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
......@@ -399,6 +407,8 @@ def make_circular_mcq_ppl_config(meta):
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
if 'test_range' in meta:
reader_cfg['test_range'] = meta['test_range']
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
......
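The hunks above all add the same optional passthrough: when a dataset's meta dict provides a 'test_range', it is copied into the reader config so that only a slice of the test split is evaluated. A hypothetical meta entry (the slice-string form is an assumption based on how reader configs are commonly written, not something this diff defines):

meta = dict(
    input_columns=['question'],
    output_column='answer',
    test_range='[0:32]',  # assumed: restrict evaluation to the first 32 test items
)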
......@@ -168,9 +168,12 @@ def _clean_up_code(text: str, language_type: str, reference) -> str:
"""Cleans up the generated code."""
try:
# for chatGLM related text
text = eval(text)
eval_text = eval(text)
except Exception:
pass
else:
if isinstance(eval_text, str):
text = eval_text
# extract code from code block
text = text.lstrip('\n')
if '```' in text:
......
import re
import cn2an
"""
task: law article prediction
......@@ -15,6 +14,7 @@ def compute_ljp_article(data_dict):
A reference contains a list of articles of the Criminal Law of the People's Republic of China.
We compute the F1-score between the prediction and the reference.
"""
import cn2an
score_list, abstentions = [], 0
......
import math
import cn2an
import re
# Legal judgment prediction: prison term prediction
def compute_ljp_imprison(data_dict):
import cn2an
score_list, abstentions = [], 0
for example in data_dict:
......
......@@ -85,6 +85,7 @@ def normalize_final_answer(final_answer: str) -> str:
# Extract answer that is in LaTeX math, is bold,
# is surrounded by a box, etc.
final_answer = re.sub(r'(\\text\{)\((.*?)\)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
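For reference, the newly added plain \text{...} rule strips the wrapper while keeping its contents, complementing the existing parenthesised variant. A quick sketch:

import re
ans = r'\text{12 cm}'
ans = re.sub(r'(\\text\{)(.*?)(\})', '\\2', ans)
# ans == '12 cm'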
......@@ -178,10 +179,7 @@ class MATHEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
return {'error': 'preds and refs have different length'}
correct = 0
count = 0
details = []
......@@ -457,8 +455,23 @@ class MATHEvaluator(BaseEvaluator):
ss2 = strip_string_func(str2)
if verbose:
print(ss1, ss2)
return ss1 == ss2
if ss1 == ss2:
return True
ss1 = normalize_final_answer(ss1)
ss2 = normalize_final_answer(ss2)
if ss1 == ss2:
return True
except Exception:
pass
try:
ss1 = normalize_final_answer(str1)
ss2 = normalize_final_answer(str2)
if ss1 == ss2:
return True
except Exception:
pass
return str1 == str2
......
......@@ -57,7 +57,7 @@ class MathBenchDataset(BaseDataset):
"""
data = []
filename = osp.join(path, f'{name}.jsonl')
with open(filename, 'r') as infile:
with open(filename, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile):
entry = json.loads(line)
if 'cloze' in name:
......
......@@ -244,6 +244,7 @@ class MBPPEvaluator(BaseEvaluator):
if not isinstance(preds, list):
preds = [preds]
for pred in preds:
pred = self._process_answer(pred)
mbpp_preds.append({'task_id': refer, 'solution': pred})
with tempfile.TemporaryDirectory() as tmp_dir:
out_dir = osp.join(tmp_dir, 'mbpp_eval.jsonl')
......
......@@ -18,14 +18,20 @@ from io import StringIO
from unittest.mock import mock_open, patch
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from pyext import RuntimeModule
from datasets import Dataset, DatasetDict, load_from_disk
try:
from pyext import RuntimeModule
except ImportError:
RuntimeModule = None
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger
from .base import BaseDataset
logger = get_logger()
TIMEOUT = 10
......@@ -34,7 +40,7 @@ class TACODataset(BaseDataset):
@staticmethod
def load(path: str, num_repeats: int = 1):
dataset = load_dataset(path)
dataset = load_from_disk(path)
new_dataset = DatasetDict()
# add new column "starter" in the prompt
for split in dataset.keys():
......@@ -69,18 +75,20 @@ class TACODataset(BaseDataset):
new_dataset[split] = Dataset.from_dict(new_data)
# num_repeats duplicate
train_repeated = []
# train_repeated = []
test_repeated = []
for sample in new_dataset['train']:
train_repeated.extend([sample] * num_repeats)
# for sample in new_dataset['train']:
# train_repeated.extend([sample] * num_repeats)
for sample in new_dataset['test']:
test_repeated.extend([sample] * num_repeats)
dataset_train_repeated = new_dataset['train'].from_list(train_repeated)
# dataset_train_repeated = new_dataset['train'].from_list(
# train_repeated
# )
dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
return DatasetDict({
'train': dataset_train_repeated,
# 'train': dataset_train_repeated,
'test': dataset_test_repeated
})
......@@ -256,7 +264,10 @@ def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
try:
signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
logger.warning('signal.SIGALRM is not available on this platform')
timeout = 4 # seconds
......