"test/vscode:/vscode.git/clone" did not exist on "304802889728707c2a162322ce18686169e732ea"
Unverified Commit bb2ecf41 authored by Hubert's avatar Hubert Committed by GitHub
Browse files

[Feat] Support cibench (#538)

* [Feat] support cidataset

* [Feat] support cidataset

* [Feat] support cidataset

* [Feat] support cidataset

* minor fix

* minor fix

* minor fix

* minor fix

* minor fix

* minor fix

* rename cibench

* rename cibench

* rename cibench

* rename cibench

* minor fix

* minor fix

* minor fix
parent 36360bdf
from mmengine.config import read_base

# Aggregator config: re-export the concrete CIBench dataset list from the
# versioned config so callers can import this stable module name instead.
# NOTE(review): the defining config declares `cibench_datasets`; confirm that
# `ci_datasets` actually exists in CIBench_gen_eb42f9 — TODO verify.
with read_base():
    from .CIBench_gen_eb42f9 import ci_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
# Reader: each sample provides the notebook's markdown cells as "questions"
# and the reference outputs/tags as "references". Only a test split exists.
cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

# Inference: pass the raw question through and let the agent inferencer run
# the multi-step tool loop.
cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

# One dataset/evaluator pair per supported library.
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']

# Per-library evaluator config; each library writes its experiment dumps to
# its own output directory.
cibench_eval_cfg = {
    lib: dict(
        evaluator=dict(
            type=CIBenchEvaluator,
            output_dir=f'output_data/cibench/{lib}'),
        pred_role="BOT",
    )
    for lib in libs
}

# Final dataset list consumed by downstream configs; data is expected under
# ./data/cibench/<Library>/ as .ipynb experiment files.
cibench_datasets = [
    dict(
        abbr=f"cibench_{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench/{lib}",
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg[lib],
    ) for lib in libs
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import DS1000Dataset_Interperter, DS1000InterpreterEvaluator
ds1000_example = """
In the following task, you should generate code with one assertion to testify the correctness of your code.
Example:
<HUMAN>Problem:
How do I get the dimensions of an array? For instance, this is (2, 2):
a = np.array([[1,2],[3,4]])
<ASSISTANT>{thought} In Python, Numpy provides a method called `shape` which helps to get the dimensions of an array.
{action} PythonInterpreter
{action_input}
```python
import numpy as np
def solution(x):
# Convert to np.ndarray
x = np.array(x)
# Getting the dimensions of the array
dimensions = x.shape
return dimensions
assert solution([[1,2],[3,4]]) == (2, 2)
```
<SYSTEM>{response}True
<ASSISTANT> {thought} By running this code, you can get the dimensions of an array.
{finish}
```python
import numpy as np
def solution(x):
# Convert to np.ndarray
x = np.array(x)
# Getting the dimensions of the array
dimensions = x.shape
return dimensions
```
"""
ds1000_reader_cfg = dict(
input_columns=["prompt"],
output_column="test_column",
train_split="test",
test_split="test",
)
ds1000_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer, example=ds1000_example),
)
ds1000_eval_cfg = dict(
evaluator=dict(type=DS1000InterpreterEvaluator),
pred_role="BOT",
)
# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
# Matplotlib cannot fit this setting
ds1000_datasets = [
dict(
abbr=f"ds1000_{lib}",
type=DS1000Dataset_Interperter, # bustm share the same format with AFQMC
path="./data/ds1000_data/",
libs=f"{lib}",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=ds1000_eval_cfg,
)
for lib in [
"Pandas",
"Numpy",
# 'Tensorflow', # error using tensorflow, skipped temporarily
"Scipy",
"Sklearn",
"Pytorch",
]
]
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kAgentEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
# This config is for code interpreter # This config is for code interpreter
gsm8k_example = """ gsm8k_example = """
Example:
<HUMAN>A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there? <HUMAN>A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?
<ASSISTANT>{thought} We need to calculate the total number of fruits. The total number of fruits in the first three baskets is given, while for the fourth basket, we need to subtract 2 from each fruit category. We can solve this problem using simple arithmetic. <ASSISTANT>{thought} We need to calculate the total number of fruits. The total number of fruits in the first three baskets is given, while for the fourth basket, we need to subtract 2 from each fruit category. We can solve this problem using simple arithmetic.
{action} PythonInterpreter {action} PythonInterpreter
...@@ -68,7 +69,7 @@ gsm8k_infer_cfg = dict( ...@@ -68,7 +69,7 @@ gsm8k_infer_cfg = dict(
inferencer=dict(type=AgentInferencer, example=gsm8k_example)) inferencer=dict(type=AgentInferencer, example=gsm8k_example))
gsm8k_eval_cfg = dict( gsm8k_eval_cfg = dict(
evaluator=dict(type=AccEvaluator), evaluator=dict(type=Gsm8kAgentEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess), pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
......
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess from opencompass.datasets import MATHDataset, MATHAgentEvaluator, math_postprocess
# This config is for code interpreter # This config is for code interpreter
math_example = """ math_example = """
Example:
<HUMAN>Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$. <HUMAN>Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.
<ASSISTANT>{thought} The domain restrictions are determined by: <ASSISTANT>{thought} The domain restrictions are determined by:
...@@ -45,7 +47,7 @@ math_infer_cfg = dict( ...@@ -45,7 +47,7 @@ math_infer_cfg = dict(
inferencer=dict(type=AgentInferencer, example=math_example)) inferencer=dict(type=AgentInferencer, example=math_example))
math_eval_cfg = dict( math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator), evaluator=dict(type=MATHAgentEvaluator),
pred_postprocessor=dict(type=math_postprocess)) pred_postprocessor=dict(type=math_postprocess))
math_datasets = [ math_datasets = [
......
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import OpenAI
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models.lagent import CodeAgent
from lagent.agents.react import ReActProtocol
with read_base():
    # Evaluate the CIBench datasets defined in the dataset config.
    from .datasets.CIBench.CIBench_gen_eb42f9 import cibench_datasets as datasets

# Appended when the agent exceeds its turn budget, to force a final answer.
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""

# ReAct system prompt; the {placeholders} are filled by ReActProtocol at
# runtime. Runtime text: do not edit wording.
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
Begin!
"""

# Single model under evaluation: GPT-3.5 wrapped in a multi-round CIReAct
# code agent with an IPython tool.
models = [
    dict(
        abbr='gpt-3.5-turbo',
        type=CodeAgent,
        agent_type=CIReAct,
        # NOTE(review): 'mutli_rounds' looks like a typo for 'multi_rounds' —
        # confirm against CodeAgent/CIReAct's accepted kwargs before renaming.
        mutli_rounds=True,
        max_turn=3,
        llm=dict(
            type=OpenAI,
            path='gpt-3.5-turbo',
            key='ENV',  # API key read from environment
            query_per_second=1,
            max_seq_len=4096,
        ),
        actions=[
            dict(
                type=IPythonInterpreter,
                description=
                '''It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.
'''),
        ],
        protocol=dict(
            type=ReActProtocol,
            call_protocol=FEWSHOT_INSTRUCTION,
            force_stop=FORCE_STOP_PROMPT_EN,
            action=dict(role='ACTION', begin='Tool:', end='\n'),
            action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
            response=dict(role='RESPONSE', begin='Tool Response:', end='\n'),
            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
        ),
        batch_size=8,
    ),
]

# Slurm-based inference: small task chunks because agent episodes are slow.
infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=50, gen_task_coef=1),
    runner=dict(
        type=SlurmRunner, max_num_workers=8, retry=2,
        task=dict(type=OpenICLInferTask)),
)
...@@ -6,8 +6,8 @@ from opencompass.models import OpenAI, HuggingFaceCausalLM ...@@ -6,8 +6,8 @@ from opencompass.models import OpenAI, HuggingFaceCausalLM
from opencompass.models.lagent import CodeAgent from opencompass.models.lagent import CodeAgent
with read_base(): with read_base():
from .datasets.math.math_gen_6cca30 import math_datasets from .datasets.math.math_gen_943d32 import math_datasets
from .datasets.gsm8k.gsm8k_gen_e7ef64 import gsm8k_datasets from .datasets.gsm8k.gsm8k_gen_57b0b1 import gsm8k_datasets
datasets = [] datasets = []
datasets += gsm8k_datasets datasets += gsm8k_datasets
......
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
PYTHON_INTERPRETER_DESCRIPTION = """\
It can run a Python code. The code must be a valid code that contains only python method.
"""
actions = [
dict(
type=PythonInterpreter,
description=PYTHON_INTERPRETER_DESCRIPTION,
answer_expr=None,
)
]
with read_base():
from .datasets.ds1000.ds1000_gen_5c4bec import ds1000_datasets as datasets
models = [
dict(
abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=actions,
batch_size=8),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(
type=LocalRunner, max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
\ No newline at end of file
...@@ -12,6 +12,7 @@ from .c3 import * # noqa: F401, F403 ...@@ -12,6 +12,7 @@ from .c3 import * # noqa: F401, F403
from .cb import * # noqa: F401, F403 from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403 from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403 from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403 from .civilcomments import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403 from .cluewsc import * # noqa: F401, F403
from .cmb import * # noqa: F401, F403 from .cmb import * # noqa: F401, F403
...@@ -26,6 +27,7 @@ from .cvalues import * # noqa: F401, F403 ...@@ -26,6 +27,7 @@ from .cvalues import * # noqa: F401, F403
from .drcd import * # noqa: F401, F403 from .drcd import * # noqa: F401, F403
from .drop import * # noqa: F401, F403 from .drop import * # noqa: F401, F403
from .ds1000 import * # noqa: F401, F403 from .ds1000 import * # noqa: F401, F403
from .ds1000_interpreter import * # noqa: F401, F403
from .eprstmt import * # noqa: F401, F403 from .eprstmt import * # noqa: F401, F403
from .flores import * # noqa: F401, F403 from .flores import * # noqa: F401, F403
from .game24 import * # noqa: F401, F403 from .game24 import * # noqa: F401, F403
......
import json
import os
import os.path as osp
import re
from typing import List, Optional
import numpy as np
from datasets import Dataset
from opencompass.lagent.actions.ipython_interpreter import extract_code
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
def load_experiment(file: str) -> dict:
    """Load a single experiment notebook with its reference solutions.

    Markdown cells become questions; code cells contribute a
    (tag, output) pair: 'vis' with a base64 PNG, 'general' with plain
    text, or 'executable' with no recorded output.
    """
    with open(file, 'r') as fp:
        cells = json.load(fp)['cells']

    questions, outputs, tags = [], [], []
    for cell in cells:
        kind = cell['cell_type']
        if kind == 'markdown':
            # Markdown sources are stored as a list of line fragments.
            questions.append(''.join(cell['source']))
        elif kind == 'code':
            # Only the last output of a code cell is considered.
            if cell['outputs'] and 'data' in cell['outputs'][-1]:
                data = cell['outputs'][-1]['data']
                if 'image/png' in data:
                    # Visualization cell: keep the base64 PNG for SSIM.
                    tags.append('vis')
                    outputs.append(data['image/png'])
                elif 'text/plain' in data:
                    # Textual result cell: keep the plain-text payload.
                    tags.append('general')
                    outputs.append(''.join(data['text/plain']))
            else:
                # No captured output: only executability is checked.
                tags.append('executable')
                outputs.append(None)

    return dict(
        experiment=file,
        questions=questions,
        references=dict(outputs=outputs, tags=tags, experiment=file),
    )
@LOAD_DATASET.register_module()
class CIBenchDataset(BaseDataset):
    """Code Interpreter dataset.

    Walks ``path`` recursively in deterministic (sorted) order and loads
    every Jupyter notebook into one record via :func:`load_experiment`.
    """

    @staticmethod
    def load(path: str):
        """Load whole dataset.

        Args:
            path (str): Root directory containing ``.ipynb`` experiments.

        Returns:
            Dataset: one row per successfully-parsed notebook.
        """
        data_list = []
        for cwd, dirs, files in os.walk(path):
            # Sort for a stable, reproducible dataset order across runs.
            dirs.sort()
            files.sort()
            for f in files:
                # Exact suffix match: the previous substring test
                # ('.ipynb' in f) also picked up names such as
                # 'foo.ipynb.bak'.
                if f.endswith('.ipynb'):
                    try:
                        data = load_experiment(os.path.join(cwd, f))
                    except Exception:
                        # Best-effort: skip unreadable/corrupt notebooks.
                        print(f'Error with file {os.path.join(cwd, f)}')
                        continue
                    data_list.append(data)

        dataset = Dataset.from_list(data_list)
        return dataset
class CIBenchEvaluator(BaseEvaluator):
    """Evaluator for CI dataset.

    Scores an agent's per-cell interpreter actions against the reference
    notebook outputs produced by ``load_experiment``: executability,
    plain-text correctness and (for figures) structural similarity.

    Args:
        output_dir (optional, str): The directory to save experiment
            files in a markdown or notebook format.
        user_data_dir (str): The directory to load local files.
            Defaults to 'ENV', which means use environment variable
            `USER_DATA_DIR` to get the data dir.
    """

    def __init__(self,
                 output_dir: Optional[str] = None,
                 user_data_dir: str = 'ENV') -> None:
        # NOTE(review): BaseEvaluator.__init__ is not invoked — confirm the
        # base class needs no initialization.
        # TODO: should use work dir for this task.
        self.output_dir = output_dir
        if user_data_dir == 'ENV':
            # Empty string disables the data-dir symlinking below.
            user_data_dir = os.environ.get('USER_DATA_DIR', '')
        self.user_data_dir = user_data_dir

    @staticmethod
    def valid_step(step):
        """Whether the step is executable and valid.

        A step is a list of actions; only the most recent interpreter
        action decides validity (no error message => valid).
        """
        # Found the latest code interpreter to determine valid
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['errmsg']:
                    return False
                else:
                    return True
        # No code interpreter for this step, reckon as False
        return False

    @staticmethod
    def correct_step(step, target):
        """Whether the step output is correct.

        Compares the fenced result text of the latest interpreter action
        with the reference output (exact or substring match).
        """
        # Found the latest code interpreter to determine correct
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['result']:
                    try:
                        pred = action['result']['text']
                        # The interpreter wraps its textual result in a
                        # ``` fenced block; extract the payload.
                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
                        if match:
                            out = match.group(1)
                            return out == target or out in target
                    except Exception:
                        return False
        # Fall back to False
        return False

    @staticmethod
    def vis_similarity_step(step, target):
        """Whether the step output image has the same structure similarity with
        the given images.

        Returns the SSIM score in [0, 1]; 0 when no figure was produced or
        any decoding/IO step fails.
        """
        # Found the latest code interpreter to determine correct
        # Heavy deps imported lazily so the evaluator loads without them.
        import base64

        import skimage
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['result']:
                    try:
                        pred = action['result']['text']
                        # Predicted figure path from the markdown image link
                        # emitted by the interpreter.
                        match = re.search(r'!\[fig-[0-9]*\]\((.*?)\)', pred,
                                          re.DOTALL)
                        if match:
                            img_pred = match.group(1)
                            # NOTE(review): target is assumed to be base64
                            # PNG bytes readable by the imageio plugin —
                            # confirm.
                            img2 = base64.b64decode(target)
                            img2 = skimage.io.imread(img2, plugin='imageio')
                            img1 = skimage.io.imread(img_pred,
                                                     plugin='imageio')
                            # Resize prediction to the reference size and
                            # rescale to uint8 before SSIM.
                            img1 = skimage.transform.resize(
                                img1, img2.shape[:2])
                            img1 = 255 * img1
                            # Convert to integer data type pixels.
                            img1 = img1.astype(np.uint8)
                            ssim = skimage.metrics.structural_similarity(
                                img1, img2, channel_axis=-1)
                            # mse = skimage.metrics.mean_squared_error(img1, img2)
                            # ssim greater better
                            # mse smaller better but has no upper bound
                            return ssim
                    except Exception:
                        return 0
        # Fall back to 0
        return 0

    def save_results(self, origin_prompt, steps):
        """Save the prediction result in a markdown and notebook format.

        One markdown file per example, interleaving each prompt with the
        code of its chosen interpreter action.
        """

        def check_jupytext():
            """Check requirements existence."""
            from shutil import which

            assert which('jupytext'), (
                "Please install jupytext use 'pip install jupytext' to ensure"
                'the conversion processes.')

        check_jupytext()

        for idx, (example_origin_prompt,
                  example_steps) in enumerate(zip(origin_prompt, steps)):
            markdown_lines = []
            for prompt, step in zip(example_origin_prompt, example_steps):
                for action in step[::-1]:
                    if action['type'] == 'IPythonInterpreter':
                        valid_action = action
                        break
                    # fall back to final action
                    # NOTE(review): the fallback is reassigned on every
                    # non-matching iteration; a for/else would state the
                    # intent more clearly.
                    valid_action = step[-1]
                markdown_lines.append(prompt)
                markdown_lines.append('\n')
                code_text = valid_action['args']['text']
                code_text = extract_code(code_text)
                code_text = '```python\n' + code_text + '\n```'
                markdown_lines.append(code_text)
                markdown_lines.append('\n')

            md_file = f'experiment{idx}.md'
            with open(md_file, 'w') as f:
                f.writelines(markdown_lines)

        # TODO: be careful for this
        # convert markdown to ipynb and exectue with error tolerance
        # subprocess.Popen(
        #     "jupytext --to ipynb --pipe-fmt ipynb "
        #     "--pipe 'jupyter nbconvert --to ipynb --execute "
        #     f"--allow-errors --stdin --stdout' {md_file}",
        #     shell=True)

    def set_data_dir(self, work_dir):
        """Set work directory and link data files for save notebook results."""
        if self.user_data_dir:
            # Symlink the user data folder into the output dir so relative
            # file references in notebooks keep resolving.
            if self.user_data_dir.endswith('/'):
                basename = osp.basename(osp.split(self.user_data_dir)[0])
            else:
                basename = osp.basename(self.user_data_dir)
            if not osp.exists(osp.join(self.output_dir, basename)):
                os.symlink(self.user_data_dir,
                           osp.join(self.output_dir, basename))
        os.chdir(work_dir)

    def unset_data_dir(self, work_dir):
        """Change work directory and keep the symlink."""
        os.chdir(work_dir)

    def score(self, predictions: List, references: List, steps: List,
              origin_prompt: List):
        """Calculate accuracy.

        Returns executable rate, general (text) accuracy, visualization
        similarity, plus the per-example raw counts.
        """
        cwd = os.getcwd()
        if self.output_dir:
            if not osp.exists(self.output_dir):
                os.makedirs(self.output_dir)
            self.set_data_dir(self.output_dir)
            self.save_results(origin_prompt, steps)
            self.unset_data_dir(cwd)

        num_cells_list = []
        num_general_list = []
        passed_list = []
        correct_list = []
        vis_list = []
        for gold, single_steps in zip(references, steps):
            tags = gold['tags']
            outputs = gold['outputs']
            num_cells = len(tags)
            num_general = sum([tag == 'general' for tag in tags])
            # Executability is checked on every cell regardless of its tag.
            passed = sum([self.valid_step(step) for step in single_steps])
            correct = 0
            vis_sim = []
            for tag, step, output in zip(tags, single_steps, outputs):
                if tag == 'general':
                    correct += self.correct_step(step, output)
                elif tag == 'vis':
                    vis_sim.append(self.vis_similarity_step(step, output))

            num_cells_list.append(num_cells)
            num_general_list.append(num_general)
            passed_list.append(passed)
            correct_list.append(correct)
            # -1 marks "no visualization cells" for this example.
            if vis_sim:
                vis_list.append(sum(vis_sim) / len(vis_sim))
            else:
                vis_list.append(-1)

        # Aggregate only over examples that actually had vis cells.
        if len([v for v in vis_list if v >= 0]) > 0:
            visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
                [v for v in vis_list if v >= 0])
        else:
            # not valid
            visualize_similarity = -1

        if sum(num_general_list) > 0:
            general_accuracy = sum(correct_list) / sum(num_general_list)
        else:
            # not valid
            general_accuracy = -1

        result = dict(
            executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
            general_accuracy=general_accuracy * 100,
            visualize_similarity=visualize_similarity * 100,
            num_cells_list=num_cells_list,
            num_general_list=num_general_list,
            passed_list=passed_list,
            correct_list=correct_list,
            vis_list=vis_list,
        )
        return result
from typing import List, Optional, Union
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from .ds1000 import DS1000Dataset
@LOAD_DATASET.register_module()
class DS1000Dataset_Interperter(DS1000Dataset):
    """Code interpreter version of DS1000."""

    def load(
        self,
        path: str,
        libs: Optional[Union[str, list]] = None,
        mode: str = 'Insertion',
    ):
        """Load DS1000 and trim each prompt at the 'A:' answer marker."""
        loaded = super().load(path, libs, mode)

        def _trim_prompt(example):
            """Get rid of unnecessary code block in prompt."""
            raw = example.pop('prompt')
            example['prompt'] = raw[:raw.find('A:\n')].strip()
            return example

        return loaded.map(_trim_prompt)
class DS1000InterpreterEvaluator(BaseEvaluator):
    """DS1000 interpreter evaluator.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def get_action(self, step):
        """Return the most recent action of type ``self.action``, if any."""
        for candidate in reversed(step):
            if candidate['type'] == self.action:
                return candidate

    def score(self, predictions: List, references: List, steps: List):
        """Calculate accuracy."""
        n_action = 0   # steps that used the interpreter at all
        n_follow = 0   # of those, steps whose code contains an assert
        n_soft = 0     # interpreter steps that ran without error
        n_exact = 0    # assert-carrying steps whose result printed 'True'
        total = len(references)

        for step in steps:
            act = self.get_action(step)
            if not act:
                continue
            n_action += 1
            if not act['errmsg']:
                n_soft += 1
            # assert must in code for testing
            # otherwise the result will be True
            if act['args'] and 'assert' in act['args']['text']:
                n_follow += 1
                # successful result should count as passed
                if act['result']:
                    n_exact += act['result']['text'] == 'True'

        return dict(
            action_pct=100 * n_action / total,
            soft_code_acc=100 * n_soft / total,
            follow_acc=100 * n_follow / total,
            code_acc=100 * n_exact / total,
        )
...@@ -49,3 +49,65 @@ class Gsm8kEvaluator(BaseEvaluator): ...@@ -49,3 +49,65 @@ class Gsm8kEvaluator(BaseEvaluator):
details.append(detail) details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details} result = {'accuracy': 100 * correct / count, 'details': details}
return result return result
class Gsm8kAgentEvaluator(BaseEvaluator):
    """Gsm8k agent evaluator for soft condition.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def soft_equal(self, pred, refer, step):
        """Return True when the action's raw result text matches the
        reference after numeric normalization; False otherwise."""
        # Pre-bind so the diagnostic print below cannot itself raise
        # NameError when step['result'] is None/missing (previous bug:
        # `soft_pred` was referenced unbound inside the except branch).
        soft_pred = None
        try:
            soft_pred = step['result']['text']
            if str(int(float(soft_pred))) == refer:
                return True
        except Exception:
            # result might not exists
            # text cannot convert to float
            print(pred, soft_pred, refer)
        return False

    def get_action(self, step):
        """Return the latest action of type ``self.action``, or None."""
        for s in step[::-1]:
            if s['type'] == self.action:
                return s

    def score(self, predictions, references, steps):
        """Calculate accuracy.

        Buckets each sample by whether the final answer was right and
        whether/what the interpreter action produced, then reports the
        derived percentage metrics.
        """
        row_reasoning_scope = 0  # right answer without any tool use
        action_scope = 0         # wrong answer but tool was used
        code_scope = 0           # ... and the code ran cleanly
        reasoning_scope = 0      # ... and the tool result was right anyway
        final_scope = 0          # right answer with tool use
        total = len(references)
        # NOTE(review): an empty reference list would divide by zero below.
        for pred, refer, step in zip(predictions, references, steps):
            # if final answer right
            if pred == refer:
                if self.get_action(step):
                    final_scope += 1
                else:
                    row_reasoning_scope += 1
            else:
                s = self.get_action(step)
                if s:
                    action_scope += 1
                    if not s['errmsg']:
                        code_scope += 1
                        # whether action result is correct
                        reasoning_scope += self.soft_equal(pred, refer, s)

        result = dict(
            follow_acc=100 * (row_reasoning_scope + final_scope) / total,
            reasoning_acc=100 *
            (reasoning_scope + final_scope + row_reasoning_scope) / total,
            code_acc=100 * (code_scope + final_scope) / total,
            action_acc=100 * (action_scope + final_scope) / total,
        )
        return result
...@@ -310,3 +310,65 @@ class MATHEvaluator(BaseEvaluator): ...@@ -310,3 +310,65 @@ class MATHEvaluator(BaseEvaluator):
return ss1 == ss2 return ss1 == ss2
except: # noqa except: # noqa
return str1 == str2 return str1 == str2
class MATHAgentEvaluator(MATHEvaluator):
    """math agent evaluator for soft condition.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def soft_equal(self, pred, refer, step):
        """Return True when the action's raw result text is equivalent to
        the reference (via inherited ``is_equiv``); False otherwise."""
        # Pre-bind so the diagnostic print below cannot itself raise
        # NameError when step['result'] is None/missing (previous bug:
        # `soft_pred` was referenced unbound inside the except branch).
        soft_pred = None
        try:
            soft_pred = step['result']['text']
            if self.is_equiv(soft_pred, refer):
                return True
        except Exception:
            # result might not exists
            print(pred, soft_pred, refer)
        return False

    def get_action(self, step):
        """Return the latest action of type ``self.action``, or None."""
        for s in step[::-1]:
            if s['type'] == self.action:
                return s

    def score(self, predictions, references, steps):
        """Calculate accuracy.

        Same bucketing as Gsm8kAgentEvaluator, except ``code_acc`` is
        normalized over tool-using samples rather than all samples.
        """
        row_reasoning_scope = 0  # right answer without any tool use
        action_scope = 0         # wrong answer but tool was used
        code_scope = 0           # ... and the code ran cleanly
        reasoning_scope = 0      # ... and the tool result was right anyway
        final_scope = 0          # right answer with tool use
        total = len(references)
        for pred, refer, step in zip(predictions, references, steps):
            # if final answer right
            if self.is_equiv(pred, refer):
                if self.get_action(step):
                    final_scope += 1
                else:
                    row_reasoning_scope += 1
            else:
                s = self.get_action(step)
                if s:
                    action_scope += 1
                    if not s['errmsg']:
                        code_scope += 1
                        # whether action result is correct
                        reasoning_scope += self.soft_equal(pred, refer, s)

        # Guard the tool-usage denominator: previously this raised
        # ZeroDivisionError when no sample ever used the interpreter and
        # none was answered correctly with it. Report 0.0 in that case.
        used = action_scope + final_scope
        result = dict(
            follow_acc=100 * (row_reasoning_scope + final_scope) / total,
            reasoning_acc=100 *
            (reasoning_scope + final_scope + row_reasoning_scope) / total,
            code_acc=(100 * (code_scope + final_scope) / used) if used else 0.0,
            action_pct=100 * used / total,
        )
        return result
import base64
import io
import logging
import os
import queue
import re
import signal
import sys
import traceback
import uuid
from typing import Optional, Tuple
import json5
import PIL.Image
from jupyter_client import KernelManager
from lagent.actions.base_action import BaseAction
from lagent.schema import ActionReturn, ActionStatusCode
WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
DEFAULT_DESCRIPTION = """启动Jupter Kernel用于执行Python代码。"""
START_CODE = """
def input(*args, **kwargs):
raise NotImplementedError('Python input() function is disabled.')
get_ipython().system = lambda *args: print('Assume we have this package, ! is disabled!')
{}
""" # noqa
class TimeoutError(Exception):
    """Raised when a script exceeds the execution time limit.

    NOTE(review): shadows the builtin TimeoutError; harmless here since it
    is only raised and caught locally, but consider renaming.
    """
    pass
class IPythonInterpreter(BaseAction):
    """A IPython executor that can execute Python scripts in a jupyter manner.

    Args:
        description (str): The description of the action. Defaults to
            DEFAULT_DESCRIPTION.
        name (str, optional): The name of the action. If None, the name will
            be the class name. Defaults to None.
        enable (bool, optional): Whether the action is enabled. Defaults to
            True.
        disable_description (str, optional): The description of the action when
            it is disabled. Defaults to None.
        timeout (int): Upper bound of waiting time for Python script execution.
            Defaults to 20.
        user_data_dir (str): Specified the user data directory for files
            loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
            Defaults to `ENV`.
    """

    # One (KernelManager, KernelClient) pair per process id, shared by all
    # interpreter instances within that process.
    _KERNEL_CLIENTS = {}

    def __init__(self,
                 description: str = DEFAULT_DESCRIPTION,
                 name: Optional[str] = None,
                 enable: bool = True,
                 disable_description: Optional[str] = None,
                 timeout: int = 20,
                 user_data_dir: str = 'ENV') -> None:
        super().__init__(description, name, enable, disable_description)

        self.timeout = timeout
        if user_data_dir == 'ENV':
            user_data_dir = os.environ.get('USER_DATA_DIR', '')
        if user_data_dir:
            # The kernel chdirs into the parent of the user data dir so that
            # relative paths in generated code resolve against it.
            user_data_dir = os.path.dirname(user_data_dir)
            user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
        self.user_data_dir = user_data_dir
        # Kernel startup is deferred until the first call.
        self._initialized = False

    @staticmethod
    def start_kernel():
        """Start a fresh Jupyter kernel and return its (manager, client)."""
        # start the kernel and manager
        km = KernelManager()
        km.start_kernel()
        kc = km.client()
        return km, kc

    def initialize(self):
        """Lazily attach to (or start) the per-process kernel and run the
        bootstrap START_CODE once."""
        if self._initialized:
            return
        pid = os.getpid()
        if pid not in self._KERNEL_CLIENTS:
            self._KERNEL_CLIENTS[pid] = self.start_kernel()
        self.kernel_manager, self.kernel_client = self._KERNEL_CLIENTS[pid]
        self._initialized = True
        self._call(START_CODE.format(self.user_data_dir), None)

    def reset(self):
        """Clear the kernel namespace (IPython %reset -f) and re-run the
        bootstrap code; starts the kernel if not yet initialized."""
        if not self._initialized:
            self.initialize()
        else:
            code = "get_ipython().run_line_magic('reset', '-f')\n" + \
                START_CODE.format(self.user_data_dir)
            self._call(code, None)

    def _call(self,
              command: str,
              timeout: Optional[int] = None) -> Tuple[str, bool]:
        """Execute ``command`` in the kernel and collect its output.

        Returns (succeed, result): result is a markdown-formatted string of
        fenced outputs and image links; succeed is False on any kernel
        error or timeout.
        """
        self.initialize()
        command = extract_code(command)

        # check previous remaining result: drain stale iopub messages left
        # by an earlier execution so they don't pollute this one.
        while True:
            try:
                msg = self.kernel_client.get_iopub_msg(timeout=1)
                msg_type = msg['msg_type']
                if msg_type == 'status':
                    if msg['content'].get('execution_state') == 'idle':
                        break
            except queue.Empty:
                # assume no result
                break

        self.kernel_client.execute(command)

        def _inner_call():
            # Pump the iopub channel until the kernel reports idle,
            # accumulating text and image outputs in markdown form.
            result = ''
            succeed = True
            image_idx = 0

            while True:
                text = ''
                image = ''
                finished = False
                msg_type = 'error'
                try:
                    msg = self.kernel_client.get_iopub_msg(timeout=10)
                    msg_type = msg['msg_type']
                    if msg_type == 'status':
                        # 'idle' marks the end of this execution.
                        if msg['content'].get('execution_state') == 'idle':
                            finished = True
                    elif msg_type == 'execute_result':
                        text = msg['content']['data'].get('text/plain', '')
                        if 'image/png' in msg['content']['data']:
                            image_b64 = msg['content']['data']['image/png']
                            image_url = publish_image_to_local(image_b64)
                            image_idx += 1
                            image = '![fig-%03d](%s)' % (image_idx, image_url)
                    elif msg_type == 'display_data':
                        if 'image/png' in msg['content']['data']:
                            image_b64 = msg['content']['data']['image/png']
                            image_url = publish_image_to_local(image_b64)
                            image_idx += 1
                            image = '![fig-%03d](%s)' % (image_idx, image_url)
                        else:
                            text = msg['content']['data'].get('text/plain', '')
                    elif msg_type == 'stream':
                        msg_type = msg['content']['name']  # stdout, stderr
                        text = msg['content']['text']
                    elif msg_type == 'error':
                        succeed = False
                        text = escape_ansi('\n'.join(
                            msg['content']['traceback']))
                        if 'M6_CODE_INTERPRETER_TIMEOUT' in text:
                            text = f'Timeout. No response after {timeout} seconds.'  # noqa
                except queue.Empty:
                    # stop current task in case break next input.
                    self.kernel_manager.interrupt_kernel()
                    succeed = False
                    text = f'Timeout. No response after {timeout} seconds.'
                    finished = True
                except Exception:
                    succeed = False
                    text = 'The code interpreter encountered an unexpected error.'  # noqa
                    logging.warning(''.join(
                        traceback.format_exception(*sys.exc_info())))
                    finished = True
                if text:
                    result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
                if image:
                    result += f'\n\n{image}'
                if finished:
                    return succeed, result

        try:
            if timeout:
                # SIGALRM enforces a hard wall-clock limit around the whole
                # message-pumping loop (main thread / POSIX only).
                def handler(signum, frame):
                    raise TimeoutError()

                signal.signal(signal.SIGALRM, handler)
                signal.alarm(timeout)
            succeed, result = _inner_call()
        except TimeoutError:
            succeed = False
            text = 'The code interpreter encountered an unexpected error.'
            result = f'\n\nerror:\n\n```\n{text}\n```'
        finally:
            if timeout:
                # Always cancel the pending alarm.
                signal.alarm(0)
        result = result.lstrip('\n')
        return succeed, result

    def __call__(self,
                 command: str,
                 timeout: Optional[int] = None) -> ActionReturn:
        """Run ``command`` and wrap the outcome in an ActionReturn."""
        tool_return = ActionReturn(url=None, args=None, type=self.name)
        tool_return.args = dict(text=command)
        succeed, result = self._call(command, timeout)
        if succeed:
            tool_return.result = dict(text=result)
            tool_return.state = ActionStatusCode.SUCCESS
        else:
            tool_return.errmsg = repr(result)
            tool_return.state = ActionStatusCode.API_ERROR
        return tool_return
def extract_code(text):
    """Extract the code payload from a model response.

    Preference order: fenced ``` block, then inline `backtick` span, then a
    JSON5 object with a 'code' field; otherwise the text is returned as-is.
    """
    # Match triple backtick blocks first
    fenced = re.search(r'```[^\n]*\n(.+?)```', text, re.DOTALL)
    if fenced:
        return fenced.group(1)

    # Match single backtick blocks second
    inline = re.search(r'`([^`]*)`', text, re.DOTALL)
    if inline:
        return inline.group(1)

    # Finally, try a JSON5 payload such as {"code": "..."}; on any parse
    # failure fall through to the original text unchanged.
    try:
        return json5.loads(text)['code']
    except Exception:
        return text
def escape_ansi(line):
    """Strip ANSI terminal escape sequences (colors, cursor moves) from
    *line* — used to clean kernel tracebacks before display."""
    return re.sub(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]', '', line)
def publish_image_to_local(image_base64: str):
    """Decode a base64-encoded PNG and save it under WORK_DIR.

    The file gets a random UUID-based name; the local path is returned.
    """
    raw = base64.b64decode(image_base64)
    assert isinstance(raw, bytes)
    out_path = os.path.join(WORK_DIR, f'{uuid.uuid4()}.png')
    PIL.Image.open(io.BytesIO(raw)).save(out_path, 'png')
    return out_path
# local test for code interpreter
def get_multiline_input(hint):
    """Read lines from stdin until EOF (CTRL-D), joined with newlines.

    Prints *hint* plus usage instructions first; returns '' when no line
    was entered.
    """
    print(hint)
    print('// Press ENTER to make a new line. Press CTRL-D to end input.')
    collected = []
    while True:
        try:
            collected.append(input())
        except EOFError:  # CTRL-D terminates input
            break
    print('// Input received.')
    if collected:
        return '\n'.join(collected)
    return ''
if __name__ == '__main__':
    # Manual smoke test: read multi-line snippets from stdin and print the
    # interpreter's formatted result. Loops until the process is interrupted.
    code_interpreter = IPythonInterpreter()
    while True:
        print(code_interpreter(get_multiline_input('Enter python code:')))
import copy
import io
import signal
from contextlib import redirect_stdout
from typing import Any, Optional
from lagent.actions.base_action import BaseAction
from lagent.schema import ActionReturn, ActionStatusCode
class TimeoutError(Exception):
    """Raised by the SIGALRM handler when script execution exceeds the limit.

    NOTE(review): this shadows the builtin ``TimeoutError`` inside this
    module — confirm that is intentional before relying on ``except``
    clauses elsewhere.
    """
    pass
def handler(signum, frame):
    """SIGALRM handler: abort the running script by raising TimeoutError."""
    raise TimeoutError
class GenericRuntime:
    """A persistent, unsandboxed namespace for executing code snippets.

    Subclasses may pre-populate globals (``GLOBAL_DICT``), locals
    (``LOCAL_DICT``) and run setup snippets (``HEADERS``) at creation time.
    State survives across ``exec_code``/``eval_code`` calls.
    """
    GLOBAL_DICT = {}
    LOCAL_DICT = None
    HEADERS = []

    def __init__(self):
        self._global_vars = copy.copy(self.GLOBAL_DICT)
        if self.LOCAL_DICT:
            self._local_vars = copy.copy(self.LOCAL_DICT)
        else:
            self._local_vars = None
        # Run any class-level setup snippets (imports, helpers, ...).
        for header in self.HEADERS:
            self.exec_code(header)

    def exec_code(self, code_piece: str) -> None:
        """Execute *code_piece* inside the persistent global namespace."""
        # NOTE(review): plain exec on model-generated code — no sandboxing.
        exec(code_piece, self._global_vars)

    def eval_code(self, expr: str) -> Any:
        """Evaluate *expr* against the persistent global namespace."""
        return eval(expr, self._global_vars)
DEFAULT_DESCRIPTION = """用来执行Python代码。代码必须是一个函数,
函数名必须得是 'solution',代码对应你的思考过程。代码实例格式如下:
```python
# import 依赖包
import xxx
def solution():
# 初始化一些变量
variable_names_with_real_meaning = xxx
# 步骤一
mid_variable = func(variable_names_with_real_meaning)
# 步骤 x
mid_variable = func(mid_variable)
# 最后结果
final_answer = func(mid_variable)
return final_answer
```"""
class PythonInterpreter(BaseAction):
    """A Python executor that can execute Python scripts.

    Args:
        description (str): The description of the action. Defaults to
            DEFAULT_DESCRIPTION.
        answer_symbol (str, Optional): name of a variable read from the
            script's global namespace after execution to obtain the answer.
        answer_expr (str, Optional): expression evaluated after the script
            runs to obtain the answer. Defaults to 'solution()'.
        answer_from_stdout (bool): whether the execution result is taken
            from captured stdout instead of a symbol/expression.
        name (str, optional): The name of the action. If None, the name will
            be the class name. Defaults to None.
        enable (bool, optional): Whether the action is enabled. Defaults to
            True.
        disable_description (str, optional): The description of the action when
            it is disabled. Defaults to None.
        timeout (int): Upper bound of waiting time for Python script
            execution, in seconds (enforced via SIGALRM).
    """

    def __init__(self,
                 description: str = DEFAULT_DESCRIPTION,
                 answer_symbol: Optional[str] = None,
                 answer_expr: Optional[str] = 'solution()',
                 answer_from_stdout: bool = False,
                 name: Optional[str] = None,
                 enable: bool = True,
                 disable_description: Optional[str] = None,
                 timeout: int = 20) -> None:
        super().__init__(description, name, enable, disable_description)
        self.answer_symbol = answer_symbol
        self.answer_expr = answer_expr
        self.answer_from_stdout = answer_from_stdout
        self.timeout = timeout

    def __call__(self, command: str) -> ActionReturn:
        # Fresh runtime per call: no state leaks between commands.
        self.runtime = GenericRuntime()
        # SIGALRM-based timeout — NOTE(review): only effective in the main
        # thread on POSIX platforms.
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(self.timeout)
        try:
            tool_return = self._call(command)
        except TimeoutError as e:
            tool_return = ActionReturn(url=None, args=None, type=self.name)
            tool_return.errmsg = repr(e)
            tool_return.state = ActionStatusCode.API_ERROR
        finally:
            # Always cancel the pending alarm so it cannot fire later.
            signal.alarm(0)
        return tool_return

    def _call(self, command: str) -> ActionReturn:
        tool_return = ActionReturn(url=None, args=None, type=self.name)
        try:
            # Strip a markdown code fence if the LLM wrapped the code in one.
            if '```python' in command:
                command = command.split('```python')[1].split('```')[0]
            elif '```' in command:
                command = command.split('```')[1].split('```')[0]
            tool_return.args = dict(text='```python\n' + command + '\n```')
            command = command.split('\n')
            if self.answer_from_stdout:
                # Capture stdout; the last printed line is the result.
                program_io = io.StringIO()
                with redirect_stdout(program_io):
                    self.runtime.exec_code('\n'.join(command))
                program_io.seek(0)
                res = program_io.readlines()[-1]
            elif self.answer_symbol:
                # Read the result from a named variable in the script globals.
                self.runtime.exec_code('\n'.join(command))
                res = self.runtime._global_vars[self.answer_symbol]
            elif self.answer_expr:
                # Evaluate an expression (e.g. 'solution()') for the result.
                self.runtime.exec_code('\n'.join(command))
                res = self.runtime.eval_code(self.answer_expr)
            else:
                # No result channel configured: run all but the last line
                # and report bare success.
                self.runtime.exec_code('\n'.join(command[:-1]))
                res = True
        except Exception as e:
            tool_return.errmsg = repr(e)
            tool_return.type = self.name
            tool_return.state = ActionStatusCode.API_ERROR
            return tool_return
        try:
            # str(res) may itself raise for exotic objects; treat that as
            # an execution failure too.
            tool_return.result = dict(text=str(res))
            tool_return.state = ActionStatusCode.SUCCESS
        except Exception as e:
            tool_return.errmsg = repr(e)
            tool_return.type = self.name
            tool_return.state = ActionStatusCode.API_ERROR
        return tool_return
import re
from typing import Union from typing import Union
from lagent.actions import ActionExecutor from lagent.actions import ActionExecutor
...@@ -5,7 +6,7 @@ from lagent.agents.base_agent import BaseAgent ...@@ -5,7 +6,7 @@ from lagent.agents.base_agent import BaseAgent
from lagent.agents.react import ReActProtocol from lagent.agents.react import ReActProtocol
from lagent.llms.base_api import BaseAPIModel from lagent.llms.base_api import BaseAPIModel
from lagent.llms.base_llm import BaseModel from lagent.llms.base_llm import BaseModel
from lagent.schema import ActionReturn, AgentReturn from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn
class ReAct(BaseAgent): class ReAct(BaseAgent):
...@@ -32,15 +33,54 @@ class ReAct(BaseAgent): ...@@ -32,15 +33,54 @@ class ReAct(BaseAgent):
action_executor=action_executor, action_executor=action_executor,
protocol=protocol) protocol=protocol)
def reset(self):
"""Reset history."""
self._session_history = []
def opencompass_adapter(self, prompt): def opencompass_adapter(self, prompt):
# adapter for prompt parsing # adapter for prompt parsing
from opencompass.utils.prompt import PromptList
if isinstance(prompt, list): if isinstance(prompt, list):
system_prompt = []
merged_prompt = []
for p in prompt: for p in prompt:
if 'content' in p: tmp_p = p.copy()
p['prompt'] = p.pop('content') if 'content' in tmp_p:
prompt = PromptList(prompt) tmp_p['prompt'] = tmp_p.pop('content')
return prompt if 'role' in tmp_p:
if tmp_p['role'] == 'system':
# skip system prompt
system_prompt.append(tmp_p['prompt'])
continue
# no system for meta template temperaily
if tmp_p['role'] == 'assistant':
tmp_p['role'] = 'BOT'
if tmp_p['role'] == 'user':
# merge previous system prompt to user
system_str = ''.join(system_prompt)
tmp_p['prompt'] = system_str + tmp_p['prompt']
tmp_p['role'] = 'HUMAN'
system_prompt = []
merged_prompt.append(tmp_p)
# merge if system still exists
if system_prompt:
if 'role' in merged_prompt[-1]:
if merged_prompt[-1]['role'] == 'HUMAN':
# append to the final human prompt
merged_prompt[-1]['prompt'] += ''.join(system_prompt)
else:
# create a human prompt behind
merged_prompt.append(
dict(role='HUMAN', prompt=''.join(system_prompt)))
from opencompass.utils.prompt import PromptList
new_prompt = PromptList()
# adapter for meta template
new_prompt.append(dict(section='round', pos='begin'))
new_prompt.extend(merged_prompt)
new_prompt.append(dict(section='round', pos='end'))
return new_prompt
def chat(self, message: str) -> AgentReturn: def chat(self, message: str) -> AgentReturn:
self._inner_history = [] self._inner_history = []
...@@ -61,6 +101,13 @@ class ReAct(BaseAgent): ...@@ -61,6 +101,13 @@ class ReAct(BaseAgent):
content=response)) content=response))
thought, action, action_input = self._protocol.parse( thought, action, action_input = self._protocol.parse(
response, self._action_executor) response, self._action_executor)
# TODO: hard code here
action_input = re.sub('<eoa>', '', action_input)
if 'tensorflow' in action_input:
# skip tensorflow currently
break
action_return: ActionReturn = self._action_executor( action_return: ActionReturn = self._action_executor(
action, action_input) action, action_input)
action_return.thought = thought action_return.thought = thought
...@@ -79,3 +126,74 @@ class ReAct(BaseAgent): ...@@ -79,3 +126,74 @@ class ReAct(BaseAgent):
self._session_history.append( self._session_history.append(
dict(role='assistant', content=agent_return.response)) dict(role='assistant', content=agent_return.response))
return agent_return return agent_return
class CIReAct(ReAct):
"""Code Interpreter version of ReAct. The success state is different from
ReAct.
Args:
llm (BaseModel or BaseAPIModel): a LLM service which can chat
and act as backend.
action_executor (ActionExecutor): an action executor to manage
all actions and their response.
protocol (ReActProtocol): a wrapper to generate prompt and
parse the response from LLM / actions.
max_turn (int): the maximum number of trails for LLM to generate
plans that can be successfully parsed by ReWOO protocol.
"""
def reset(self):
"""Reset history and reset action if suit the case."""
self._session_history = []
# hard code here
from opencompass.lagent.actions.ipython_interpreter import \
IPythonInterpreter
b = IPythonInterpreter()
b.reset()
def chat(self, message: str) -> AgentReturn:
self._inner_history = []
# append the user message for session history
self._session_history.append(dict(role='user', content=message))
agent_return = AgentReturn()
force_stop = False
default_response = '对不起,我无法回答你的问题'
for turn in range(self.max_turn):
prompt = self._protocol.format(
chat_history=self.session_history,
inner_step=self._inner_history,
action_executor=self._action_executor,
force_stop=force_stop)
prompt = self.opencompass_adapter(prompt)
# allow single generation
response = self._llm.generate_from_template([prompt], 512)[0]
self._inner_history.append(dict(role='assistant',
content=response))
thought, action, action_input = self._protocol.parse(
response, self._action_executor)
action_return: ActionReturn = self._action_executor(
action, action_input)
action_return.thought = thought
agent_return.actions.append(action_return)
if action_return.state == ActionStatusCode.SUCCESS:
# if success, stash model response and system response
self._session_history.append(
dict(role='assistant', content=action_return.args['text']))
self._session_history.append(
dict(
role='system',
content=self._protocol.format_response(action_return)))
agent_return.response = action_return.result['text']
return agent_return
elif action_return.type == self._action_executor.invalid_action.name: # noqa
action_return.errmsg = 'The action is invalid, please check the action name.' # noqa
self._inner_history.append(
dict(role='system',
content=self._protocol.format_response(action_return)))
if turn == self.max_turn - 1:
force_stop = True
agent_return.response = default_response
self._session_history.append(
dict(role='assistant', content=agent_return.response))
return agent_return
...@@ -14,12 +14,19 @@ class LagentAgent: ...@@ -14,12 +14,19 @@ class LagentAgent:
https://github.com/InternLM/lagent. https://github.com/InternLM/lagent.
""" """
def __init__(self, agent_type, llm, actions=None, protocol=None, **kwargs): def __init__(self,
agent_type,
llm,
actions=None,
protocol=None,
mutli_rounds=False,
**kwargs):
llm = REGISTRY.build(llm) llm = REGISTRY.build(llm)
agent_cfg = {'type': agent_type, 'llm': llm, **kwargs} agent_cfg = {'type': agent_type, 'llm': llm, **kwargs}
if actions is not None: if actions is not None:
from lagent.actions import ActionExecutor from lagent.actions import ActionExecutor
executor = ActionExecutor( executor = ActionExecutor(
[REGISTRY.build(action) for action in actions]) [REGISTRY.build(action) for action in actions])
agent_cfg['action_executor'] = executor agent_cfg['action_executor'] = executor
...@@ -28,6 +35,7 @@ class LagentAgent: ...@@ -28,6 +35,7 @@ class LagentAgent:
agent_cfg['protocol'] = protocol agent_cfg['protocol'] = protocol
self.agent = REGISTRY.build(agent_cfg) self.agent = REGISTRY.build(agent_cfg)
self.mutli_rounds = mutli_rounds
def add_example(self, example): def add_example(self, example):
# format example in protocol if needed # format example in protocol if needed
...@@ -39,10 +47,11 @@ class LagentAgent: ...@@ -39,10 +47,11 @@ class LagentAgent:
get_logger().warning('Protocal template does not have example' get_logger().warning('Protocal template does not have example'
' placeholder, please check your template.') ' placeholder, please check your template.')
def chat(self, user_input, ice=None) -> Tuple[str, List[dict]]: def one_round_chat(self, user_input, ice=None) -> Tuple[str, List[dict]]:
"""One round chat with agent."""
from lagent.schema import ActionReturn, AgentReturn from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input) generation: AgentReturn = self.agent.chat(user_input)
self.agent._session_history = [] # clear agent history
answer = generation.response answer = generation.response
steps = [] steps = []
...@@ -60,11 +69,26 @@ class LagentAgent: ...@@ -60,11 +69,26 @@ class LagentAgent:
)) ))
return answer, steps return answer, steps
def chat(self, user_input, ice=None) -> Tuple[str, List[dict]]:
"""Chat with agent."""
if self.mutli_rounds:
steps = []
for single_input in user_input:
answer, one_round_steps = self.one_round_chat(single_input)
steps.append(one_round_steps)
else:
answer, steps = self.one_round_chat(user_input)
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information.""" # noqa self.agent.reset() # clear agent history
return answer, steps
FORCE_STOP_PROMPT_EN = (
"""You should directly give results based on history information.""" # noqa
)
FEWSHOT_INSTRUCTION = """\ FEWSHOT_INSTRUCTION = """\
You are a assistant who can utilize external tools. You are an assistant who can utilize external tools.
{{tool_description}} {{tool_description}}
To use a tool, please use the following format: To use a tool, please use the following format:
``` ```
...@@ -82,14 +106,12 @@ please using the following format to reply: ...@@ -82,14 +106,12 @@ please using the following format to reply:
{{thought}} the thought process to get the final answer {{thought}} the thought process to get the final answer
{{finish}} final answer {{finish}} final answer
``` ```
Example:
{example} {example}
Begin! Begin!
""" # noqa """ # noqa
PYTHON_INTERPRETER_DESCRIPTION = '''\ PYTHON_INTERPRETER_DESCRIPTION = """\
It can run a Python code. The code must be a valid code that contains only python method, and the method' name must be 'solution' and returns a dict, which key is variable name. The libraries I recommend are sympy and scipy. the format is: It can run a Python code. The code must be a valid code that contains only python method, and the method' name must be 'solution' and returns a dict, which key is variable name. The libraries I recommend are sympy and scipy. the format is:
```python ```python
# import packages # import packages
...@@ -102,21 +124,28 @@ def solution(): ...@@ -102,21 +124,28 @@ def solution():
# final answer # final answer
final_answer = func(mid_variable) final_answer = func(mid_variable)
return final_answer return final_answer
```''' # noqa ```""" # noqa
class CodeAgent: class CodeAgent:
"""Agent wrapper for Lagent.""" """Code Agent wrapper for Lagent."""
def __new__(self, llm, **kwargs): def __new__(self, llm, **kwargs):
from lagent.actions import PythonInterpreter
from lagent.agents.react import ReActProtocol from lagent.agents.react import ReActProtocol
from opencompass.lagent.actions.python_interpreter import \
PythonInterpreter
mutli_rounds = kwargs.pop('mutli_rounds', False)
agent_type = kwargs.pop('agent_type', ReAct) agent_type = kwargs.pop('agent_type', ReAct)
max_turn = kwargs.pop('max_turn', 3) max_turn = kwargs.pop('max_turn', 3)
actions = kwargs.pop('actions', [ actions = kwargs.pop(
'actions',
[
dict(type=PythonInterpreter, dict(type=PythonInterpreter,
description=PYTHON_INTERPRETER_DESCRIPTION), description=PYTHON_INTERPRETER_DESCRIPTION),
]) ],
)
protocol = kwargs.pop( protocol = kwargs.pop(
'protocol', 'protocol',
dict( dict(
...@@ -124,10 +153,12 @@ class CodeAgent: ...@@ -124,10 +153,12 @@ class CodeAgent:
call_protocol=FEWSHOT_INSTRUCTION, call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN, force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'), finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)) ),
)
return LagentAgent(agent_type=agent_type, return LagentAgent(agent_type=agent_type,
llm=llm, llm=llm,
max_turn=max_turn, max_turn=max_turn,
actions=actions, actions=actions,
protocol=protocol, protocol=protocol,
mutli_rounds=mutli_rounds,
**kwargs) **kwargs)
...@@ -77,8 +77,13 @@ class AgentInferencer(BaseInferencer): ...@@ -77,8 +77,13 @@ class AgentInferencer(BaseInferencer):
logger.info('Starting inference process...') logger.info('Starting inference process...')
for idx, ice_indices in tqdm(enumerate(ice_idx_list[start:], start), for idx, ice_indices in tqdm(enumerate(ice_idx_list[start:], start),
disable=not self.is_main_process): disable=not self.is_main_process):
user_input = retriever.generate_prompt_for_generate_task( # TODO: This will break the Prompt template
idx, ice='', prompt_template=prompt_template) # get user input directly without formatting prompt
#
# user_input = retriever.generate_prompt_for_generate_task(
# idx, ice='', prompt_template=prompt_template)
user_input = retriever.dataset_reader.dataset['test'][
retriever.dataset_reader.input_columns[0]][idx]
gold = retriever.dataset_reader.dataset['test'][ gold = retriever.dataset_reader.dataset['test'][
retriever.dataset_reader.output_column][idx] retriever.dataset_reader.output_column][idx]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment