Unverified commit e4830a69 authored by klein, committed by GitHub

Update CIBench (#1089)



* modify the requirements/runtime.txt: numpy==1.23.4 --> numpy>=1.23.4

* update cibench: dataset and evaluation

* fix cibench summarizer bug

* update cibench

* move extract_code import

---------
Co-authored-by: zhangchuyu@pjlab.org.cn <zhangchuyu@pjlab.org.cn>
Co-authored-by: Leymore <zfz-960727@163.com>
parent e404b72c
from mmengine.config import read_base

with read_base():
    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
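For context, a minimal sketch of how an entry-point config like the one above is typically loaded (the path below is illustrative): mmengine executes the `read_base()` block and merges the imported top-level names into the resulting config.

```python
from mmengine.config import Config

# Illustrative path; any config using the read_base() pattern above works.
cfg = Config.fromfile('configs/datasets/CIBench/CIBench_gen.py')
# Names imported inside read_base() become keys of the loaded config, so
# the datasets defined in CIBench_gen_8ab0dc are reachable here.
print([d['abbr'] for d in cfg['ci_datasets']])
```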
@@ -19,15 +19,14 @@ cibench_infer_cfg = dict(
     inferencer=dict(type=AgentInferencer, infer_mode='every'),
 )
 
-libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
+libs = ['matplotlib', 'opencv', 'pandas', 'pytorch', 'scipy', 'seaborn']
 cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
 
 cibench_datasets = [
     dict(
-        abbr=f"cibench_generation_{lib}",
+        abbr=f"cibench_generation/{lib}",
         type=CIBenchDataset,
-        path=f"./data/cibench/{lib}",
+        path=f"./data/cibench_dataset/cibench_generation/{lib}",
         internet_check=False,
         reader_cfg=cibench_reader_cfg,
         infer_cfg=cibench_infer_cfg,
...
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every_with_gt'),
)

libs = ['matplotlib', 'opencv', 'pandas', 'pytorch', 'scipy', 'seaborn']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

cibench_datasets = [
    dict(
        abbr=f"cibench_generation_oracle/{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench_dataset/cibench_generation/{lib}",
        internet_check=False,
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg,
    ) for lib in libs
]
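This oracle variant differs from the plain generation config only in `infer_mode='every_with_gt'` and the `_oracle` abbreviation. A rough sketch of what the two modes mean for a multi-step session (illustrative logic, not the actual AgentInferencer implementation):

```python
def run_session(steps, agent, infer_mode):
    """Contrast 'every' with 'every_with_gt' (oracle) inference.

    steps: list of (question, gold_answer) pairs. In oracle mode the
    ground-truth answer replaces the model's own answer in the chat
    history, so each step is scored independently of earlier mistakes.
    """
    history, predictions = [], []
    for question, gold in steps:
        pred = agent.chat(history + [question])  # assumed agent interface
        predictions.append(pred)
        follow_up = gold if infer_mode == 'every_with_gt' else pred
        history += [question, follow_up]
    return predictions
```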
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import AgentInferencer
-from opencompass.datasets import CIBenchTemplateDataset, CIBenchEvaluator
+from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
 
 cibench_reader_cfg = dict(
     input_columns=["questions"],
@@ -26,11 +26,10 @@ libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
     '_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
     '_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']
 cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
 
 cibench_datasets = [
     dict(
         abbr=f"cibench_template{lib}",
-        type=CIBenchTemplateDataset,
+        type=CIBenchDataset,
         path=f"./data/cibench_dataset/cibench_template{lib}",
         internet_check=False,
         reader_cfg=cibench_reader_cfg,
...
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every_with_gt'),
)

# no tensorboard
libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
        '/scipy', '/seaborn', '/sklearn', '/tensorflow',
        '_chinese/lightgbm', '_chinese/matplotlib', '_chinese/nltk',
        '_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
        '_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

cibench_datasets = [
    dict(
        abbr=f"cibench_template_oracle{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench_dataset/cibench_template{lib}",
        internet_check=False,
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg,
    ) for lib in libs
]
@@ -5,29 +5,58 @@ with read_base():
 summarizer = dict(
     dataset_abbrs=[
-        '######## CIBench Generation ########', # category
-        ['cibench', 'executable'],
-        ['cibench', 'general_correct'],
-        ['cibench', 'vis_sim'],
+        '######## CIBench Generation########', # category
+        'cibench_generation:tool_rate',
+        'cibench_generation:executable',
+        'cibench_generation:numeric_correct',
+        'cibench_generation:text_score',
+        'cibench_generation:vis_sim',
+        '######## CIBench Generation Oracle########', # category
+        'cibench_generation_oracle:tool_rate',
+        'cibench_generation_oracle:executable',
+        'cibench_generation_oracle:numeric_correct',
+        'cibench_generation_oracle:text_score',
+        'cibench_generation_oracle:vis_sim',
         '######## CIBench Template ########', # category
+        'cibench_template:tool_rate',
         'cibench_template:executable',
         'cibench_template:numeric_correct',
         'cibench_template:text_score',
         'cibench_template:vis_sim',
+        '######## CIBench Template Oracle########', # category
+        'cibench_template_oracle:tool_rate',
+        'cibench_template_oracle:executable',
+        'cibench_template_oracle:numeric_correct',
+        'cibench_template_oracle:text_score',
+        'cibench_template_oracle:vis_sim',
         '######## CIBench Template Chinese ########', # category
+        'cibench_template_cn:tool_rate',
         'cibench_template_cn:executable',
         'cibench_template_cn:numeric_correct',
         'cibench_template_cn:text_score',
         'cibench_template_cn:vis_sim',
-        '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
-        'cibench_template_wo_nltk:executable',
-        'cibench_template_wo_nltk:numeric_correct',
-        'cibench_template_wo_nltk:vis_sim',
-        '######## CIBench Template Chinese w/o NLTK ########', # category
-        'cibench_template_cn_wo_nltk:executable',
-        'cibench_template_cn_wo_nltk:numeric_correct',
-        'cibench_template_cn_wo_nltk:vis_sim',
+        '######## CIBench Template Chinese Oracle########', # category
+        'cibench_template_cn_oracle:tool_rate',
+        'cibench_template_cn_oracle:executable',
+        'cibench_template_cn_oracle:numeric_correct',
+        'cibench_template_cn_oracle:text_score',
+        'cibench_template_cn_oracle:vis_sim',
+        '######## CIBench Category Metric ########',
+        'cibench_data_manipulation:scores',
+        'cibench_data_visualization:scores',
+        'cibench_modeling:scores',
+        'cibench_nlp:scores',
+        'cibench_ip:scores',
+        'cibench_math:scores',
+        '######## CIBench Category Metric Oracle ########',
+        'cibench_data_manipulation_oracle:scores',
+        'cibench_data_visualization_oracle:scores',
+        'cibench_modeling_oracle:scores',
+        'cibench_nlp_oracle:scores',
+        'cibench_ip_oracle:scores',
+        'cibench_math_oracle:scores',
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
 )
\ No newline at end of file
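The `summary_groups` line collects every `*_summary_groups` list that `read_base()` has pulled into the module namespace and flattens them into one list. A self-contained demo of the idiom (the group dicts below are made-up examples):

```python
# Stand-ins for lists that read_base() would normally inject at module level.
cibench_summary_groups = [dict(name='cibench_generation', subsets=['a', 'b'])]
plugineval_summary_groups = [dict(name='plugin_eval', subsets=['c'])]

# locals() is evaluated in module scope, so every *_summary_groups name is
# found; sum(..., []) concatenates the lists into one flat list of groups.
merged = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in merged])  # ['cibench_generation', 'plugin_eval']
```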
@@ -144,6 +144,7 @@ class IPythonInterpreter(BaseAction):
 
         def _inner_call():
             result = ''
+            image_path = ''
             succeed = True
             image_idx = 0
 
@@ -197,7 +198,7 @@ class IPythonInterpreter(BaseAction):
                 if text:
                     result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
                 if image:
-                    result += f'\n\n{image}'
+                    image_path += f'\n\n{image}'
                 if finished:
                     # in case output text too long
                     # might need better design later
@@ -205,7 +206,7 @@ class IPythonInterpreter(BaseAction):
                 ellip = '......'
                 half_len = int((self.trim_output - len(ellip)) / 2)
                 result = result[:half_len] + ellip + result[-half_len:]
-            return succeed, result
+            return succeed, result, image_path
 
         try:
             if timeout:
@@ -215,7 +216,7 @@ class IPythonInterpreter(BaseAction):
                 signal.signal(signal.SIGALRM, handler)
                 signal.alarm(timeout)
-            succeed, result = _inner_call()
+            succeed, result, image_path = _inner_call()
         except TimeoutError:
             succeed = False
             text = 'The code interpreter encountered an unexpected error.'
@@ -225,7 +226,8 @@ class IPythonInterpreter(BaseAction):
             signal.alarm(0)
         result = result.lstrip('\n')
-        return succeed, result
+        image_path = image_path.lstrip('\n')
+        return succeed, result, image_path
 
     def __call__(self,
                  command: str,
@@ -234,11 +236,12 @@ class IPythonInterpreter(BaseAction):
         extracted_command = extract_code(command)
         tool_return.args = dict(text=command, extract_code=extracted_command)
         if extracted_command:
-            succeed, result = self._call(extracted_command, timeout)
+            succeed, result, image_path = self._call(extracted_command,
+                                                     timeout)
             if succeed:
                 if not result:
                     result = 'The code is succeed without any outputs.'
-                tool_return.result = dict(text=result)
+                tool_return.result = dict(text=result, image_path=image_path)
                 tool_return.state = ActionStatusCode.SUCCESS
             else:
                 tool_return.errmsg = repr(result)
...
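Net effect of the interpreter refactor: `_inner_call`, `_call`, and `__call__` now thread a third return value through, so image paths are reported under their own `image_path` key instead of being appended to the text transcript. A hedged sketch of a downstream consumer (hypothetical helper, not part of this PR):

```python
def collect_image_paths(tool_return):
    """Hypothetical helper: list the figure paths emitted by one action call.

    After this change, tool_return.result['image_path'] is a
    newline-separated string of paths (built by the `image_path += ...`
    lines above), kept apart from the text in tool_return.result['text'].
    """
    raw = tool_return.result.get('image_path', '')
    return [line for line in raw.split('\n') if line.strip()]
```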
@@ -44,9 +44,10 @@ class LagentAgent:
 
     def gt_response(self, prompt):
         if 'CIReAct' in str(self.agent.__class__):
-            gold = prompt
-            prompt = f"""{self.agent._protocol.action['begin']} IPythonInterpreter
-{self.agent._protocol.action_input['begin']} ```python\n{gold}\n```\n""" # noqa
+            thought, gold = prompt.split('**split**')
+            prompt = f"""{self.agent._protocol.thought['begin']} {thought}\
+\n{self.agent._protocol.action['begin']} IPythonInterpreter\n\
+{self.agent._protocol.action_input['begin']}```python\n{gold}\n```\n""" # noqa
             action_input = dict(
                 command=f"""```python\n{gold}\n```\n""",
                 timeout=120,
...
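`gt_response` now expects each gold reference to carry a thought and the gold code joined by the literal `**split**` marker, so the oracle prompt can replay a full ReAct turn (thought, action, action input). A minimal illustration with made-up strings:

```python
# New reference format: "<thought>**split**<gold code>".
reference = ("Load the data with pandas first.**split**"
             "import pandas as pd\ndf = pd.read_csv('data.csv')")
thought, gold = reference.split('**split**')
assert thought == 'Load the data with pandas first.'
assert gold.startswith('import pandas')
```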
@@ -17,7 +17,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
 from opencompass.utils.prompt import get_prompt_hash
 
 METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match']
-METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'tool_rate']
+METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
 
 def model_abbr_from_cfg_used_in_summarizer(model):
     if model.get('summarizer_abbr', None):
...
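Dropping `'tool_rate'` from `METRIC_BLACKLIST` is what lets the new `*:tool_rate` rows in the summarizer config above appear in the summary table. A simplified sketch of the assumed filtering rule (the real summarizer logic is more involved):

```python
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']

def is_reported(metric: str) -> bool:
    # Simplified assumption: blacklisted metrics are hidden from the table.
    return metric not in METRIC_BLACKLIST

assert is_reported('tool_rate')  # visible again after this change
```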
@@ -5,7 +5,7 @@ json5
 jupyter
 jupyter_client
 jupytext
-lagent
+lagent==0.1.2
 lightgbm==4.1.0
 networkx
 scikit-image
...
@@ -11,6 +11,7 @@ func_timeout
 fuzzywuzzy
 immutabledict
 jieba
+json5
 langdetect
 ltp
 mmengine-lite
...