Unverified commit e4830a69 authored by klein, committed by GitHub

Update CIBench (#1089)



* modify the requirements/runtime.txt: numpy==1.23.4 --> numpy>=1.23.4

* update cibench: dataset and evaluation

* fix cibench summarizer bug

* update cibench

* move extract_code import

---------
Co-authored-by: zhangchuyu@pjlab.org.cn <zhangchuyu@pjlab.org.cn>
Co-authored-by: Leymore <zfz-960727@163.com>
parent e404b72c
from mmengine.config import read_base

with read_base():
    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
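For context, a minimal sketch of how an entry-point config like the one above is typically loaded (the path below is illustrative): mmengine executes the `read_base()` block and merges the imported top-level names into the resulting config.

```python
from mmengine.config import Config

# Illustrative path; any config using the read_base() pattern above works.
cfg = Config.fromfile('configs/datasets/CIBench/CIBench_gen.py')
# Names imported inside read_base() become keys of the loaded config, so
# the datasets defined in CIBench_gen_8ab0dc are reachable here.
print([d['abbr'] for d in cfg['ci_datasets']])
```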
@@ -19,15 +19,14 @@ cibench_infer_cfg = dict(
     inferencer=dict(type=AgentInferencer, infer_mode='every'),
 )
 
-libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
+libs = ['matplotlib', 'opencv', 'pandas', 'pytorch', 'scipy', 'seaborn']
 cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
 
 cibench_datasets = [
     dict(
-        abbr=f"cibench_generation_{lib}",
+        abbr=f"cibench_generation/{lib}",
         type=CIBenchDataset,
-        path=f"./data/cibench/{lib}",
+        path=f"./data/cibench_dataset/cibench_generation/{lib}",
         internet_check=False,
         reader_cfg=cibench_reader_cfg,
         infer_cfg=cibench_infer_cfg,
...
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every_with_gt'),
)

libs = ['matplotlib', 'opencv', 'pandas', 'pytorch', 'scipy', 'seaborn']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

cibench_datasets = [
    dict(
        abbr=f"cibench_generation_oracle/{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench_dataset/cibench_generation/{lib}",
        internet_check=False,
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg,
    ) for lib in libs
]
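This oracle variant differs from the plain generation config only in `infer_mode='every_with_gt'` and the `_oracle` abbreviation. A rough sketch of what the two modes mean for a multi-step session (illustrative logic, not the actual AgentInferencer implementation):

```python
def run_session(steps, agent, infer_mode):
    """Contrast 'every' with 'every_with_gt' (oracle) inference.

    steps: list of (question, gold_answer) pairs. In oracle mode the
    ground-truth answer replaces the model's own answer in the chat
    history, so each step is scored independently of earlier mistakes.
    """
    history, predictions = [], []
    for question, gold in steps:
        pred = agent.chat(history + [question])  # assumed agent interface
        predictions.append(pred)
        follow_up = gold if infer_mode == 'every_with_gt' else pred
        history += [question, follow_up]
    return predictions
```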
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import AgentInferencer
-from opencompass.datasets import CIBenchTemplateDataset, CIBenchEvaluator
+from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
 
 cibench_reader_cfg = dict(
     input_columns=["questions"],
@@ -26,11 +26,10 @@ libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
     '_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
     '_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']
 cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
 
 cibench_datasets = [
     dict(
         abbr=f"cibench_template{lib}",
-        type=CIBenchTemplateDataset,
+        type=CIBenchDataset,
         path=f"./data/cibench_dataset/cibench_template{lib}",
         internet_check=False,
         reader_cfg=cibench_reader_cfg,
...
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every_with_gt'),
)

# no tensorboard
libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
        '/scipy', '/seaborn', '/sklearn', '/tensorflow',
        '_chinese/lightgbm', '_chinese/matplotlib', '_chinese/nltk',
        '_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
        '_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

cibench_datasets = [
    dict(
        abbr=f"cibench_template_oracle{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench_dataset/cibench_template{lib}",
        internet_check=False,
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg,
    ) for lib in libs
]
@@ -5,29 +5,58 @@ with read_base():
 summarizer = dict(
     dataset_abbrs=[
-        '######## CIBench Generation ########', # category
-        ['cibench', 'executable'],
-        ['cibench', 'general_correct'],
-        ['cibench', 'vis_sim'],
+        '######## CIBench Generation########', # category
+        'cibench_generation:tool_rate',
+        'cibench_generation:executable',
+        'cibench_generation:numeric_correct',
+        'cibench_generation:text_score',
+        'cibench_generation:vis_sim',
+        '######## CIBench Generation Oracle########', # category
+        'cibench_generation_oracle:tool_rate',
+        'cibench_generation_oracle:executable',
+        'cibench_generation_oracle:numeric_correct',
+        'cibench_generation_oracle:text_score',
+        'cibench_generation_oracle:vis_sim',
         '######## CIBench Template ########', # category
+        'cibench_template:tool_rate',
         'cibench_template:executable',
         'cibench_template:numeric_correct',
         'cibench_template:text_score',
         'cibench_template:vis_sim',
+        '######## CIBench Template Oracle########', # category
+        'cibench_template_oracle:tool_rate',
+        'cibench_template_oracle:executable',
+        'cibench_template_oracle:numeric_correct',
+        'cibench_template_oracle:text_score',
+        'cibench_template_oracle:vis_sim',
         '######## CIBench Template Chinese ########', # category
+        'cibench_template_cn:tool_rate',
         'cibench_template_cn:executable',
         'cibench_template_cn:numeric_correct',
         'cibench_template_cn:text_score',
         'cibench_template_cn:vis_sim',
-        '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
-        'cibench_template_wo_nltk:executable',
-        'cibench_template_wo_nltk:numeric_correct',
-        'cibench_template_wo_nltk:vis_sim',
-        '######## CIBench Template Chinese w/o NLTK ########', # category
-        'cibench_template_cn_wo_nltk:executable',
-        'cibench_template_cn_wo_nltk:numeric_correct',
-        'cibench_template_cn_wo_nltk:vis_sim',
+        '######## CIBench Template Chinese Oracle########', # category
+        'cibench_template_cn_oracle:tool_rate',
+        'cibench_template_cn_oracle:executable',
+        'cibench_template_cn_oracle:numeric_correct',
+        'cibench_template_cn_oracle:text_score',
+        'cibench_template_cn_oracle:vis_sim',
+        '######## CIBench Category Metric ########',
+        'cibench_data_manipulation:scores',
+        'cibench_data_visualization:scores',
+        'cibench_modeling:scores',
+        'cibench_nlp:scores',
+        'cibench_ip:scores',
+        'cibench_math:scores',
+        '######## CIBench Category Metric Oracle ########',
+        'cibench_data_manipulation_oracle:scores',
+        'cibench_data_visualization_oracle:scores',
+        'cibench_modeling_oracle:scores',
+        'cibench_nlp_oracle:scores',
+        'cibench_ip_oracle:scores',
+        'cibench_math_oracle:scores',
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
 )
\ No newline at end of file
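The `summary_groups` line collects every `*_summary_groups` list that `read_base()` has pulled into the module namespace and flattens them into one list. A self-contained demo of the idiom (the group dicts below are made-up examples):

```python
# Stand-ins for lists that read_base() would normally inject at module level.
cibench_summary_groups = [dict(name='cibench_generation', subsets=['a', 'b'])]
plugineval_summary_groups = [dict(name='plugin_eval', subsets=['c'])]

# locals() is evaluated in module scope, so every *_summary_groups name is
# found; sum(..., []) concatenates the lists into one flat list of groups.
merged = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in merged])  # ['cibench_generation', 'plugin_eval']
```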
@@ -144,6 +144,7 @@ class IPythonInterpreter(BaseAction):
 
         def _inner_call():
             result = ''
+            image_path = ''
             succeed = True
             image_idx = 0
 
@@ -197,7 +198,7 @@ class IPythonInterpreter(BaseAction):
                 if text:
                     result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
                 if image:
-                    result += f'\n\n{image}'
+                    image_path += f'\n\n{image}'
                 if finished:
                     # in case output text too long
                     # might need better design later
@@ -205,7 +206,7 @@ class IPythonInterpreter(BaseAction):
                 ellip = '......'
                 half_len = int((self.trim_output - len(ellip)) / 2)
                 result = result[:half_len] + ellip + result[-half_len:]
-            return succeed, result
+            return succeed, result, image_path
 
         try:
             if timeout:
@@ -215,7 +216,7 @@ class IPythonInterpreter(BaseAction):
                 signal.signal(signal.SIGALRM, handler)
                 signal.alarm(timeout)
-            succeed, result = _inner_call()
+            succeed, result, image_path = _inner_call()
         except TimeoutError:
             succeed = False
             text = 'The code interpreter encountered an unexpected error.'
@@ -225,7 +226,8 @@ class IPythonInterpreter(BaseAction):
             signal.alarm(0)
         result = result.lstrip('\n')
-        return succeed, result
+        image_path = image_path.lstrip('\n')
+        return succeed, result, image_path
 
     def __call__(self,
                  command: str,
@@ -234,11 +236,12 @@ class IPythonInterpreter(BaseAction):
         extracted_command = extract_code(command)
         tool_return.args = dict(text=command, extract_code=extracted_command)
         if extracted_command:
-            succeed, result = self._call(extracted_command, timeout)
+            succeed, result, image_path = self._call(extracted_command,
+                                                     timeout)
             if succeed:
                 if not result:
                     result = 'The code is succeed without any outputs.'
-                tool_return.result = dict(text=result)
+                tool_return.result = dict(text=result, image_path=image_path)
                 tool_return.state = ActionStatusCode.SUCCESS
             else:
                 tool_return.errmsg = repr(result)
...
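Net effect of the interpreter refactor: `_inner_call`, `_call`, and `__call__` now thread a third return value through, so image paths are reported under their own `image_path` key instead of being appended to the text transcript. A hedged sketch of a downstream consumer (hypothetical helper, not part of this PR):

```python
def collect_image_paths(tool_return):
    """Hypothetical helper: list the figure paths emitted by one action call.

    After this change, tool_return.result['image_path'] is a
    newline-separated string of paths (built by the `image_path += ...`
    lines above), kept apart from the text in tool_return.result['text'].
    """
    raw = tool_return.result.get('image_path', '')
    return [line for line in raw.split('\n') if line.strip()]
```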
@@ -44,9 +44,10 @@ class LagentAgent:
 
     def gt_response(self, prompt):
         if 'CIReAct' in str(self.agent.__class__):
-            gold = prompt
-            prompt = f"""{self.agent._protocol.action['begin']} IPythonInterpreter
-{self.agent._protocol.action_input['begin']} ```python\n{gold}\n```\n""" # noqa
+            thought, gold = prompt.split('**split**')
+            prompt = f"""{self.agent._protocol.thought['begin']} {thought}\
+\n{self.agent._protocol.action['begin']} IPythonInterpreter\n\
+{self.agent._protocol.action_input['begin']}```python\n{gold}\n```\n""" # noqa
             action_input = dict(
                 command=f"""```python\n{gold}\n```\n""",
                 timeout=120,
...
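`gt_response` now expects each gold reference to carry a thought and the gold code joined by the literal `**split**` marker, so the oracle prompt can replay a full ReAct turn (thought, action, action input). A minimal illustration with made-up strings:

```python
# New reference format: "<thought>**split**<gold code>".
reference = ("Load the data with pandas first.**split**"
             "import pandas as pd\ndf = pd.read_csv('data.csv')")
thought, gold = reference.split('**split**')
assert thought == 'Load the data with pandas first.'
assert gold.startswith('import pandas')
```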
@@ -17,7 +17,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
 from opencompass.utils.prompt import get_prompt_hash
 
 METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match']
-METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'tool_rate']
+METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
 
 def model_abbr_from_cfg_used_in_summarizer(model):
     if model.get('summarizer_abbr', None):
...
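Dropping `'tool_rate'` from `METRIC_BLACKLIST` is what lets the new `*:tool_rate` rows in the summarizer config above appear in the summary table. A simplified sketch of the assumed filtering rule (the real summarizer logic is more involved):

```python
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']

def is_reported(metric: str) -> bool:
    # Simplified assumption: blacklisted metrics are hidden from the table.
    return metric not in METRIC_BLACKLIST

assert is_reported('tool_rate')  # visible again after this change
```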
@@ -5,7 +5,7 @@ json5
 jupyter
 jupyter_client
 jupytext
-lagent
+lagent==0.1.2
 lightgbm==4.1.0
 networkx
 scikit-image
...
@@ -11,6 +11,7 @@ func_timeout
 fuzzywuzzy
 immutabledict
 jieba
+json5
 langdetect
 ltp
 mmengine-lite
...