Unverified Commit e4830a69 authored by klein's avatar klein Committed by GitHub
Browse files

Update CIBench (#1089)



* modify the requirements/runtime.txt: numpy==1.23.4 --> numpy>=1.23.4

* update cibench: dataset and evaluation

* cibench summarizer bug

* update cibench

* move extract_code import

---------
Co-authored-by: default avatarzhangchuyu@pjlab.org.cn <zhangchuyu@pjlab.org.cn>
Co-authored-by: default avatarLeymore <zfz-960727@163.com>
parent e404b72c
from mmengine.config import read_base
with read_base():
from .CIBench_gen_8ab0dc import ci_datasets # noqa: F401, F403
......@@ -19,15 +19,14 @@ cibench_infer_cfg = dict(
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
libs = ['matplotlib', 'opencv', 'pandas', 'pytorch', 'scipy', 'seaborn']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_generation_{lib}",
abbr=f"cibench_generation/{lib}",
type=CIBenchDataset,
path=f"./data/cibench/{lib}",
path=f"./data/cibench_dataset/cibench_generation/{lib}",
internet_check=False,
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
......
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

# Reader config: questions in, references out; both splits map to 'test'.
cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

# Zero-shot agent inference. infer_mode='every_with_gt' presumably runs the
# agent step-by-step with ground truth fed back between steps (the "oracle"
# variant) — confirm against AgentInferencer.
cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every_with_gt'),
)

libs = ['matplotlib', 'opencv', 'pandas', 'pytorch', 'scipy', 'seaborn']

cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

# One dataset entry per library; all entries share the same reader,
# inference, and evaluation configs.
cibench_datasets = []
for _lib in libs:
    cibench_datasets.append(
        dict(
            abbr=f"cibench_generation_oracle/{_lib}",
            type=CIBenchDataset,
            path=f"./data/cibench_dataset/cibench_generation/{_lib}",
            internet_check=False,
            reader_cfg=cibench_reader_cfg,
            infer_cfg=cibench_infer_cfg,
            eval_cfg=cibench_eval_cfg,
        ))
......@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchTemplateDataset, CIBenchEvaluator
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
cibench_reader_cfg = dict(
input_columns=["questions"],
......@@ -26,11 +26,10 @@ libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
'_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
'_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_template{lib}",
type=CIBenchTemplateDataset,
type=CIBenchDataset,
path=f"./data/cibench_dataset/cibench_template{lib}",
internet_check=False,
reader_cfg=cibench_reader_cfg,
......
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

# Reader config: questions in, references out; both splits map to 'test'.
cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

# Zero-shot agent inference. infer_mode='every_with_gt' presumably runs the
# agent step-by-step with ground truth fed back between steps (the "oracle"
# variant) — confirm against AgentInferencer.
cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every_with_gt'),
)

# no tensorboard
# English ('/xxx') and Chinese ('_chinese/xxx') template subsets; each entry
# is a path suffix appended to "cibench_template" below.
libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
        '/scipy', '/seaborn', '/sklearn', '/tensorflow',
        '_chinese/lightgbm', '_chinese/matplotlib', '_chinese/nltk',
        '_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
        '_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']

cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

# One dataset entry per library subset; all entries share the same reader,
# inference, and evaluation configs.
cibench_datasets = []
for _lib in libs:
    cibench_datasets.append(
        dict(
            abbr=f"cibench_template_oracle{_lib}",
            type=CIBenchDataset,
            path=f"./data/cibench_dataset/cibench_template{_lib}",
            internet_check=False,
            reader_cfg=cibench_reader_cfg,
            infer_cfg=cibench_infer_cfg,
            eval_cfg=cibench_eval_cfg,
        ))
......@@ -5,29 +5,58 @@ with read_base():
summarizer = dict(
dataset_abbrs=[
'######## CIBench Generation ########', # category
['cibench', 'executable'],
['cibench', 'general_correct'],
['cibench', 'vis_sim'],
'######## CIBench Generation########', # category
'cibench_generation:tool_rate',
'cibench_generation:executable',
'cibench_generation:numeric_correct',
'cibench_generation:text_score',
'cibench_generation:vis_sim',
'######## CIBench Generation Oracle########', # category
'cibench_generation_oracle:tool_rate',
'cibench_generation_oracle:executable',
'cibench_generation_oracle:numeric_correct',
'cibench_generation_oracle:text_score',
'cibench_generation_oracle:vis_sim',
'######## CIBench Template ########', # category
'cibench_template:tool_rate',
'cibench_template:executable',
'cibench_template:numeric_correct',
'cibench_template:text_score',
'cibench_template:vis_sim',
'######## CIBench Template Oracle########', # category
'cibench_template_oracle:tool_rate',
'cibench_template_oracle:executable',
'cibench_template_oracle:numeric_correct',
'cibench_template_oracle:text_score',
'cibench_template_oracle:vis_sim',
'######## CIBench Template Chinese ########', # category
'cibench_template_cn:tool_rate',
'cibench_template_cn:executable',
'cibench_template_cn:numeric_correct',
'cibench_template_cn:text_score',
'cibench_template_cn:vis_sim',
'######## CIBench Template w/o NLTK ########', # category; no text score because it is only for nltk
'cibench_template_wo_nltk:executable',
'cibench_template_wo_nltk:numeric_correct',
'cibench_template_wo_nltk:vis_sim',
'######## CIBench Template Chinese w/o NLTK ########', # category
'cibench_template_cn_wo_nltk:executable',
'cibench_template_cn_wo_nltk:numeric_correct',
'cibench_template_cn_wo_nltk:vis_sim',
'######## CIBench Template Chinese Oracle########', # category
'cibench_template_cn_oracle:tool_rate',
'cibench_template_cn_oracle:executable',
'cibench_template_cn_oracle:numeric_correct',
'cibench_template_cn_oracle:text_score',
'cibench_template_cn_oracle:vis_sim',
'######## CIBench Category Metric ########',
'cibench_data_manipulation:scores',
'cibench_data_visualization:scores',
'cibench_modeling:scores',
'cibench_nlp:scores',
'cibench_ip:scores',
'cibench_math:scores',
'######## CIBench Category Metric Oracle ########',
'cibench_data_manipulation_oracle:scores',
'cibench_data_visualization_oracle:scores',
'cibench_modeling_oracle:scores',
'cibench_nlp_oracle:scores',
'cibench_ip_oracle:scores',
'cibench_math_oracle:scores',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
)
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
......@@ -144,6 +144,7 @@ class IPythonInterpreter(BaseAction):
def _inner_call():
result = ''
image_path = ''
succeed = True
image_idx = 0
......@@ -197,7 +198,7 @@ class IPythonInterpreter(BaseAction):
if text:
result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
if image:
result += f'\n\n{image}'
image_path += f'\n\n{image}'
if finished:
# in case output text too long
# might need better design later
......@@ -205,7 +206,7 @@ class IPythonInterpreter(BaseAction):
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result
return succeed, result, image_path
try:
if timeout:
......@@ -215,7 +216,7 @@ class IPythonInterpreter(BaseAction):
signal.signal(signal.SIGALRM, handler)
signal.alarm(timeout)
succeed, result = _inner_call()
succeed, result, image_path = _inner_call()
except TimeoutError:
succeed = False
text = 'The code interpreter encountered an unexpected error.'
......@@ -225,7 +226,8 @@ class IPythonInterpreter(BaseAction):
signal.alarm(0)
result = result.lstrip('\n')
return succeed, result
image_path = image_path.lstrip('\n')
return succeed, result, image_path
def __call__(self,
command: str,
......@@ -234,11 +236,12 @@ class IPythonInterpreter(BaseAction):
extracted_command = extract_code(command)
tool_return.args = dict(text=command, extract_code=extracted_command)
if extracted_command:
succeed, result = self._call(extracted_command, timeout)
succeed, result, image_path = self._call(extracted_command,
timeout)
if succeed:
if not result:
result = 'The code is succeed without any outputs.'
tool_return.result = dict(text=result)
tool_return.result = dict(text=result, image_path=image_path)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
......
......@@ -44,9 +44,10 @@ class LagentAgent:
def gt_response(self, prompt):
if 'CIReAct' in str(self.agent.__class__):
gold = prompt
prompt = f"""{self.agent._protocol.action['begin']} IPythonInterpreter
{self.agent._protocol.action_input['begin']} ```python\n{gold}\n```\n""" # noqa
thought, gold = prompt.split('**split**')
prompt = f"""{self.agent._protocol.thought['begin']} {thought}\
\n{self.agent._protocol.action['begin']} IPythonInterpreter\n\
{self.agent._protocol.action_input['begin']}```python\n{gold}\n```\n""" # noqa
action_input = dict(
command=f"""```python\n{gold}\n```\n""",
timeout=120,
......
......@@ -17,7 +17,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
from opencompass.utils.prompt import get_prompt_hash
METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match']
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'tool_rate']
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
......
......@@ -5,7 +5,7 @@ json5
jupyter
jupyter_client
jupytext
lagent
lagent==0.1.2
lightgbm==4.1.0
networkx
scikit-image
......
......@@ -11,6 +11,7 @@ func_timeout
fuzzywuzzy
immutabledict
jieba
json5
langdetect
ltp
mmengine-lite
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment