"test/vscode:/vscode.git/clone" did not exist on "304802889728707c2a162322ce18686169e732ea"
Unverified Commit bb2ecf41 authored by Hubert's avatar Hubert Committed by GitHub
Browse files

[Feat] Support cibench (#538)

* [Feat] support cidataset

* [Feat] support cidataset

* [Feat] support cidataset

* [Feat] support cidataset

* minor fix

* minor fix

* minor fix

* minor fix

* minor fix

* minor fix

* rename cibench

* rename cibench

* rename cibench

* rename cibench

* minor fix

* minor fix

* minor fix
parent 36360bdf
from mmengine.config import read_base

# Aggregator config: re-export the concrete CIBench dataset list from the
# versioned config so callers can import this stable module name instead.
# NOTE(review): the defining config declares `cibench_datasets`; confirm that
# `ci_datasets` actually exists in CIBench_gen_eb42f9 — TODO verify.
with read_base():
    from .CIBench_gen_eb42f9 import ci_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
# Reader: each sample provides the notebook's markdown cells as "questions"
# and the reference outputs/tags as "references". Only a test split exists.
cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

# Inference: pass the raw question through and let the agent inferencer run
# the multi-step tool loop.
cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

# One dataset/evaluator pair per supported library.
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']

# Per-library evaluator config; each library writes its experiment dumps to
# its own output directory.
cibench_eval_cfg = {
    lib: dict(
        evaluator=dict(
            type=CIBenchEvaluator,
            output_dir=f'output_data/cibench/{lib}'),
        pred_role="BOT",
    )
    for lib in libs
}

# Final dataset list consumed by downstream configs; data is expected under
# ./data/cibench/<Library>/ as .ipynb experiment files.
cibench_datasets = [
    dict(
        abbr=f"cibench_{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench/{lib}",
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg[lib],
    ) for lib in libs
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import DS1000Dataset_Interperter, DS1000InterpreterEvaluator
ds1000_example = """
In the following task, you should generate code with one assertion to testify the correctness of your code.
Example:
<HUMAN>Problem:
How do I get the dimensions of an array? For instance, this is (2, 2):
a = np.array([[1,2],[3,4]])
<ASSISTANT>{thought} In Python, Numpy provides a method called `shape` which helps to get the dimensions of an array.
{action} PythonInterpreter
{action_input}
```python
import numpy as np
def solution(x):
# Convert to np.ndarray
x = np.array(x)
# Getting the dimensions of the array
dimensions = x.shape
return dimensions
assert solution([[1,2],[3,4]]) == (2, 2)
```
<SYSTEM>{response}True
<ASSISTANT> {thought} By running this code, you can get the dimensions of an array.
{finish}
```python
import numpy as np
def solution(x):
# Convert to np.ndarray
x = np.array(x)
# Getting the dimensions of the array
dimensions = x.shape
return dimensions
```
"""
ds1000_reader_cfg = dict(
input_columns=["prompt"],
output_column="test_column",
train_split="test",
test_split="test",
)
ds1000_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer, example=ds1000_example),
)
ds1000_eval_cfg = dict(
evaluator=dict(type=DS1000InterpreterEvaluator),
pred_role="BOT",
)
# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
# Matplotlib cannot fit this setting
ds1000_datasets = [
dict(
abbr=f"ds1000_{lib}",
type=DS1000Dataset_Interperter, # bustm share the same format with AFQMC
path="./data/ds1000_data/",
libs=f"{lib}",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=ds1000_eval_cfg,
)
for lib in [
"Pandas",
"Numpy",
# 'Tensorflow', # error using tensorflow, skipped temporarily
"Scipy",
"Sklearn",
"Pytorch",
]
]
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kAgentEvaluator
from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
# This config is for code interpreter # This config is for code interpreter
gsm8k_example = """ gsm8k_example = """
Example:
<HUMAN>A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there? <HUMAN>A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?
<ASSISTANT>{thought} We need to calculate the total number of fruits. The total number of fruits in the first three baskets is given, while for the fourth basket, we need to subtract 2 from each fruit category. We can solve this problem using simple arithmetic. <ASSISTANT>{thought} We need to calculate the total number of fruits. The total number of fruits in the first three baskets is given, while for the fourth basket, we need to subtract 2 from each fruit category. We can solve this problem using simple arithmetic.
{action} PythonInterpreter {action} PythonInterpreter
...@@ -68,7 +69,7 @@ gsm8k_infer_cfg = dict( ...@@ -68,7 +69,7 @@ gsm8k_infer_cfg = dict(
inferencer=dict(type=AgentInferencer, example=gsm8k_example)) inferencer=dict(type=AgentInferencer, example=gsm8k_example))
gsm8k_eval_cfg = dict( gsm8k_eval_cfg = dict(
evaluator=dict(type=AccEvaluator), evaluator=dict(type=Gsm8kAgentEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess), pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
......
from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess from opencompass.datasets import MATHDataset, MATHAgentEvaluator, math_postprocess
# This config is for code interpreter # This config is for code interpreter
math_example = """ math_example = """
Example:
<HUMAN>Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$. <HUMAN>Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.
<ASSISTANT>{thought} The domain restrictions are determined by: <ASSISTANT>{thought} The domain restrictions are determined by:
...@@ -45,7 +47,7 @@ math_infer_cfg = dict( ...@@ -45,7 +47,7 @@ math_infer_cfg = dict(
inferencer=dict(type=AgentInferencer, example=math_example)) inferencer=dict(type=AgentInferencer, example=math_example))
math_eval_cfg = dict( math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator), evaluator=dict(type=MATHAgentEvaluator),
pred_postprocessor=dict(type=math_postprocess)) pred_postprocessor=dict(type=math_postprocess))
math_datasets = [ math_datasets = [
......
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import OpenAI
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models.lagent import CodeAgent
from lagent.agents.react import ReActProtocol
with read_base():
    # Evaluate the CIBench datasets defined in the dataset config.
    from .datasets.CIBench.CIBench_gen_eb42f9 import cibench_datasets as datasets

# Appended when the agent exceeds its turn budget, to force a final answer.
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""

# ReAct system prompt; the {placeholders} are filled by ReActProtocol at
# runtime. Runtime text: do not edit wording.
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
Begin!
"""

# Single model under evaluation: GPT-3.5 wrapped in a multi-round CIReAct
# code agent with an IPython tool.
models = [
    dict(
        abbr='gpt-3.5-turbo',
        type=CodeAgent,
        agent_type=CIReAct,
        # NOTE(review): 'mutli_rounds' looks like a typo for 'multi_rounds' —
        # confirm against CodeAgent/CIReAct's accepted kwargs before renaming.
        mutli_rounds=True,
        max_turn=3,
        llm=dict(
            type=OpenAI,
            path='gpt-3.5-turbo',
            key='ENV',  # API key read from environment
            query_per_second=1,
            max_seq_len=4096,
        ),
        actions=[
            dict(
                type=IPythonInterpreter,
                description=
                '''It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.
'''),
        ],
        protocol=dict(
            type=ReActProtocol,
            call_protocol=FEWSHOT_INSTRUCTION,
            force_stop=FORCE_STOP_PROMPT_EN,
            action=dict(role='ACTION', begin='Tool:', end='\n'),
            action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
            response=dict(role='RESPONSE', begin='Tool Response:', end='\n'),
            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
        ),
        batch_size=8,
    ),
]

# Slurm-based inference: small task chunks because agent episodes are slow.
infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=50, gen_task_coef=1),
    runner=dict(
        type=SlurmRunner, max_num_workers=8, retry=2,
        task=dict(type=OpenICLInferTask)),
)
...@@ -6,8 +6,8 @@ from opencompass.models import OpenAI, HuggingFaceCausalLM ...@@ -6,8 +6,8 @@ from opencompass.models import OpenAI, HuggingFaceCausalLM
from opencompass.models.lagent import CodeAgent from opencompass.models.lagent import CodeAgent
with read_base(): with read_base():
from .datasets.math.math_gen_6cca30 import math_datasets from .datasets.math.math_gen_943d32 import math_datasets
from .datasets.gsm8k.gsm8k_gen_e7ef64 import gsm8k_datasets from .datasets.gsm8k.gsm8k_gen_57b0b1 import gsm8k_datasets
datasets = [] datasets = []
datasets += gsm8k_datasets datasets += gsm8k_datasets
......
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
PYTHON_INTERPRETER_DESCRIPTION = """\
It can run a Python code. The code must be a valid code that contains only python method.
"""
actions = [
dict(
type=PythonInterpreter,
description=PYTHON_INTERPRETER_DESCRIPTION,
answer_expr=None,
)
]
with read_base():
from .datasets.ds1000.ds1000_gen_5c4bec import ds1000_datasets as datasets
models = [
dict(
abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=actions,
batch_size=8),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(
type=LocalRunner, max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
\ No newline at end of file
...@@ -12,6 +12,7 @@ from .c3 import * # noqa: F401, F403 ...@@ -12,6 +12,7 @@ from .c3 import * # noqa: F401, F403
from .cb import * # noqa: F401, F403 from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403 from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403 from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403 from .civilcomments import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403 from .cluewsc import * # noqa: F401, F403
from .cmb import * # noqa: F401, F403 from .cmb import * # noqa: F401, F403
...@@ -26,6 +27,7 @@ from .cvalues import * # noqa: F401, F403 ...@@ -26,6 +27,7 @@ from .cvalues import * # noqa: F401, F403
from .drcd import * # noqa: F401, F403 from .drcd import * # noqa: F401, F403
from .drop import * # noqa: F401, F403 from .drop import * # noqa: F401, F403
from .ds1000 import * # noqa: F401, F403 from .ds1000 import * # noqa: F401, F403
from .ds1000_interpreter import * # noqa: F401, F403
from .eprstmt import * # noqa: F401, F403 from .eprstmt import * # noqa: F401, F403
from .flores import * # noqa: F401, F403 from .flores import * # noqa: F401, F403
from .game24 import * # noqa: F401, F403 from .game24 import * # noqa: F401, F403
......
import json
import os
import os.path as osp
import re
from typing import List, Optional
import numpy as np
from datasets import Dataset
from opencompass.lagent.actions.ipython_interpreter import extract_code
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
def load_experiment(file: str) -> dict:
    """Load a single experiment notebook with its reference solutions.

    Markdown cells become questions; code cells contribute a
    (tag, output) pair: 'vis' with a base64 PNG, 'general' with plain
    text, or 'executable' with no recorded output.
    """
    with open(file, 'r') as fp:
        cells = json.load(fp)['cells']

    questions, outputs, tags = [], [], []
    for cell in cells:
        kind = cell['cell_type']
        if kind == 'markdown':
            # Markdown sources are stored as a list of line fragments.
            questions.append(''.join(cell['source']))
        elif kind == 'code':
            # Only the last output of a code cell is considered.
            if cell['outputs'] and 'data' in cell['outputs'][-1]:
                data = cell['outputs'][-1]['data']
                if 'image/png' in data:
                    # Visualization cell: keep the base64 PNG for SSIM.
                    tags.append('vis')
                    outputs.append(data['image/png'])
                elif 'text/plain' in data:
                    # Textual result cell: keep the plain-text payload.
                    tags.append('general')
                    outputs.append(''.join(data['text/plain']))
            else:
                # No captured output: only executability is checked.
                tags.append('executable')
                outputs.append(None)

    return dict(
        experiment=file,
        questions=questions,
        references=dict(outputs=outputs, tags=tags, experiment=file),
    )
@LOAD_DATASET.register_module()
class CIBenchDataset(BaseDataset):
    """Code Interpreter dataset.

    Walks ``path`` recursively in deterministic (sorted) order and loads
    every Jupyter notebook into one record via :func:`load_experiment`.
    """

    @staticmethod
    def load(path: str):
        """Load whole dataset.

        Args:
            path (str): Root directory containing ``.ipynb`` experiments.

        Returns:
            Dataset: one row per successfully-parsed notebook.
        """
        data_list = []
        for cwd, dirs, files in os.walk(path):
            # Sort for a stable, reproducible dataset order across runs.
            dirs.sort()
            files.sort()
            for f in files:
                # Exact suffix match: the previous substring test
                # ('.ipynb' in f) also picked up names such as
                # 'foo.ipynb.bak'.
                if f.endswith('.ipynb'):
                    try:
                        data = load_experiment(os.path.join(cwd, f))
                    except Exception:
                        # Best-effort: skip unreadable/corrupt notebooks.
                        print(f'Error with file {os.path.join(cwd, f)}')
                        continue
                    data_list.append(data)

        dataset = Dataset.from_list(data_list)
        return dataset
class CIBenchEvaluator(BaseEvaluator):
    """Evaluator for CI dataset.

    Scores an agent's per-cell interpreter actions against the reference
    notebook outputs produced by ``load_experiment``: executability,
    plain-text correctness and (for figures) structural similarity.

    Args:
        output_dir (optional, str): The directory to save experiment
            files in a markdown or notebook format.
        user_data_dir (str): The directory to load local files.
            Defaults to 'ENV', which means use environment variable
            `USER_DATA_DIR` to get the data dir.
    """

    def __init__(self,
                 output_dir: Optional[str] = None,
                 user_data_dir: str = 'ENV') -> None:
        # NOTE(review): BaseEvaluator.__init__ is not invoked — confirm the
        # base class needs no initialization.
        # TODO: should use work dir for this task.
        self.output_dir = output_dir
        if user_data_dir == 'ENV':
            # Empty string disables the data-dir symlinking below.
            user_data_dir = os.environ.get('USER_DATA_DIR', '')
        self.user_data_dir = user_data_dir

    @staticmethod
    def valid_step(step):
        """Whether the step is executable and valid.

        A step is a list of actions; only the most recent interpreter
        action decides validity (no error message => valid).
        """
        # Found the latest code interpreter to determine valid
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['errmsg']:
                    return False
                else:
                    return True
        # No code interpreter for this step, reckon as False
        return False

    @staticmethod
    def correct_step(step, target):
        """Whether the step output is correct.

        Compares the fenced result text of the latest interpreter action
        with the reference output (exact or substring match).
        """
        # Found the latest code interpreter to determine correct
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['result']:
                    try:
                        pred = action['result']['text']
                        # The interpreter wraps its textual result in a
                        # ``` fenced block; extract the payload.
                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
                        if match:
                            out = match.group(1)
                            return out == target or out in target
                    except Exception:
                        return False
        # Fall back to False
        return False

    @staticmethod
    def vis_similarity_step(step, target):
        """Whether the step output image has the same structure similarity with
        the given images.

        Returns the SSIM score in [0, 1]; 0 when no figure was produced or
        any decoding/IO step fails.
        """
        # Found the latest code interpreter to determine correct
        # Heavy deps imported lazily so the evaluator loads without them.
        import base64

        import skimage
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['result']:
                    try:
                        pred = action['result']['text']
                        # Predicted figure path from the markdown image link
                        # emitted by the interpreter.
                        match = re.search(r'!\[fig-[0-9]*\]\((.*?)\)', pred,
                                          re.DOTALL)
                        if match:
                            img_pred = match.group(1)
                            # NOTE(review): target is assumed to be base64
                            # PNG bytes readable by the imageio plugin —
                            # confirm.
                            img2 = base64.b64decode(target)
                            img2 = skimage.io.imread(img2, plugin='imageio')
                            img1 = skimage.io.imread(img_pred,
                                                     plugin='imageio')
                            # Resize prediction to the reference size and
                            # rescale to uint8 before SSIM.
                            img1 = skimage.transform.resize(
                                img1, img2.shape[:2])
                            img1 = 255 * img1
                            # Convert to integer data type pixels.
                            img1 = img1.astype(np.uint8)
                            ssim = skimage.metrics.structural_similarity(
                                img1, img2, channel_axis=-1)
                            # mse = skimage.metrics.mean_squared_error(img1, img2)
                            # ssim greater better
                            # mse smaller better but has no upper bound
                            return ssim
                    except Exception:
                        return 0
        # Fall back to 0
        return 0

    def save_results(self, origin_prompt, steps):
        """Save the prediction result in a markdown and notebook format.

        One markdown file per example, interleaving each prompt with the
        code of its chosen interpreter action.
        """

        def check_jupytext():
            """Check requirements existence."""
            from shutil import which

            assert which('jupytext'), (
                "Please install jupytext use 'pip install jupytext' to ensure"
                'the conversion processes.')

        check_jupytext()

        for idx, (example_origin_prompt,
                  example_steps) in enumerate(zip(origin_prompt, steps)):
            markdown_lines = []
            for prompt, step in zip(example_origin_prompt, example_steps):
                for action in step[::-1]:
                    if action['type'] == 'IPythonInterpreter':
                        valid_action = action
                        break
                    # fall back to final action
                    # NOTE(review): the fallback is reassigned on every
                    # non-matching iteration; a for/else would state the
                    # intent more clearly.
                    valid_action = step[-1]
                markdown_lines.append(prompt)
                markdown_lines.append('\n')
                code_text = valid_action['args']['text']
                code_text = extract_code(code_text)
                code_text = '```python\n' + code_text + '\n```'
                markdown_lines.append(code_text)
                markdown_lines.append('\n')

            md_file = f'experiment{idx}.md'
            with open(md_file, 'w') as f:
                f.writelines(markdown_lines)

        # TODO: be careful for this
        # convert markdown to ipynb and exectue with error tolerance
        # subprocess.Popen(
        #     "jupytext --to ipynb --pipe-fmt ipynb "
        #     "--pipe 'jupyter nbconvert --to ipynb --execute "
        #     f"--allow-errors --stdin --stdout' {md_file}",
        #     shell=True)

    def set_data_dir(self, work_dir):
        """Set work directory and link data files for save notebook results."""
        if self.user_data_dir:
            # Symlink the user data folder into the output dir so relative
            # file references in notebooks keep resolving.
            if self.user_data_dir.endswith('/'):
                basename = osp.basename(osp.split(self.user_data_dir)[0])
            else:
                basename = osp.basename(self.user_data_dir)
            if not osp.exists(osp.join(self.output_dir, basename)):
                os.symlink(self.user_data_dir,
                           osp.join(self.output_dir, basename))
        os.chdir(work_dir)

    def unset_data_dir(self, work_dir):
        """Change work directory and keep the symlink."""
        os.chdir(work_dir)

    def score(self, predictions: List, references: List, steps: List,
              origin_prompt: List):
        """Calculate accuracy.

        Returns executable rate, general (text) accuracy, visualization
        similarity, plus the per-example raw counts.
        """
        cwd = os.getcwd()
        if self.output_dir:
            if not osp.exists(self.output_dir):
                os.makedirs(self.output_dir)
            self.set_data_dir(self.output_dir)
            self.save_results(origin_prompt, steps)
            self.unset_data_dir(cwd)

        num_cells_list = []
        num_general_list = []
        passed_list = []
        correct_list = []
        vis_list = []
        for gold, single_steps in zip(references, steps):
            tags = gold['tags']
            outputs = gold['outputs']
            num_cells = len(tags)
            num_general = sum([tag == 'general' for tag in tags])
            # Executability is checked on every cell regardless of its tag.
            passed = sum([self.valid_step(step) for step in single_steps])
            correct = 0
            vis_sim = []
            for tag, step, output in zip(tags, single_steps, outputs):
                if tag == 'general':
                    correct += self.correct_step(step, output)
                elif tag == 'vis':
                    vis_sim.append(self.vis_similarity_step(step, output))

            num_cells_list.append(num_cells)
            num_general_list.append(num_general)
            passed_list.append(passed)
            correct_list.append(correct)
            # -1 marks "no visualization cells" for this example.
            if vis_sim:
                vis_list.append(sum(vis_sim) / len(vis_sim))
            else:
                vis_list.append(-1)

        # Aggregate only over examples that actually had vis cells.
        if len([v for v in vis_list if v >= 0]) > 0:
            visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
                [v for v in vis_list if v >= 0])
        else:
            # not valid
            visualize_similarity = -1

        if sum(num_general_list) > 0:
            general_accuracy = sum(correct_list) / sum(num_general_list)
        else:
            # not valid
            general_accuracy = -1

        result = dict(
            executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
            general_accuracy=general_accuracy * 100,
            visualize_similarity=visualize_similarity * 100,
            num_cells_list=num_cells_list,
            num_general_list=num_general_list,
            passed_list=passed_list,
            correct_list=correct_list,
            vis_list=vis_list,
        )
        return result
from typing import List, Optional, Union
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from .ds1000 import DS1000Dataset
@LOAD_DATASET.register_module()
class DS1000Dataset_Interperter(DS1000Dataset):
    """Code interpreter version of DS1000."""

    def load(
        self,
        path: str,
        libs: Optional[Union[str, list]] = None,
        mode: str = 'Insertion',
    ):
        """Load DS1000 and trim each prompt at the 'A:' answer marker."""
        loaded = super().load(path, libs, mode)

        def _trim_prompt(example):
            """Get rid of unnecessary code block in prompt."""
            raw = example.pop('prompt')
            example['prompt'] = raw[:raw.find('A:\n')].strip()
            return example

        return loaded.map(_trim_prompt)
class DS1000InterpreterEvaluator(BaseEvaluator):
    """DS1000 interpreter evaluator.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def get_action(self, step):
        """Return the most recent action of type ``self.action``, if any."""
        for candidate in reversed(step):
            if candidate['type'] == self.action:
                return candidate

    def score(self, predictions: List, references: List, steps: List):
        """Calculate accuracy."""
        n_action = 0   # steps that used the interpreter at all
        n_follow = 0   # of those, steps whose code contains an assert
        n_soft = 0     # interpreter steps that ran without error
        n_exact = 0    # assert-carrying steps whose result printed 'True'
        total = len(references)

        for step in steps:
            act = self.get_action(step)
            if not act:
                continue
            n_action += 1
            if not act['errmsg']:
                n_soft += 1
            # assert must in code for testing
            # otherwise the result will be True
            if act['args'] and 'assert' in act['args']['text']:
                n_follow += 1
                # successful result should count as passed
                if act['result']:
                    n_exact += act['result']['text'] == 'True'

        return dict(
            action_pct=100 * n_action / total,
            soft_code_acc=100 * n_soft / total,
            follow_acc=100 * n_follow / total,
            code_acc=100 * n_exact / total,
        )
...@@ -49,3 +49,65 @@ class Gsm8kEvaluator(BaseEvaluator): ...@@ -49,3 +49,65 @@ class Gsm8kEvaluator(BaseEvaluator):
details.append(detail) details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details} result = {'accuracy': 100 * correct / count, 'details': details}
return result return result
class Gsm8kAgentEvaluator(BaseEvaluator):
    """Gsm8k agent evaluator for soft condition.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def soft_equal(self, pred, refer, step):
        """Return True when the action's raw result text matches the
        reference after numeric normalization; False otherwise."""
        # Pre-bind so the diagnostic print below cannot itself raise
        # NameError when step['result'] is None/missing (previous bug:
        # `soft_pred` was referenced unbound inside the except branch).
        soft_pred = None
        try:
            soft_pred = step['result']['text']
            if str(int(float(soft_pred))) == refer:
                return True
        except Exception:
            # result might not exists
            # text cannot convert to float
            print(pred, soft_pred, refer)
        return False

    def get_action(self, step):
        """Return the latest action of type ``self.action``, or None."""
        for s in step[::-1]:
            if s['type'] == self.action:
                return s

    def score(self, predictions, references, steps):
        """Calculate accuracy.

        Buckets each sample by whether the final answer was right and
        whether/what the interpreter action produced, then reports the
        derived percentage metrics.
        """
        row_reasoning_scope = 0  # right answer without any tool use
        action_scope = 0         # wrong answer but tool was used
        code_scope = 0           # ... and the code ran cleanly
        reasoning_scope = 0      # ... and the tool result was right anyway
        final_scope = 0          # right answer with tool use
        total = len(references)
        # NOTE(review): an empty reference list would divide by zero below.
        for pred, refer, step in zip(predictions, references, steps):
            # if final answer right
            if pred == refer:
                if self.get_action(step):
                    final_scope += 1
                else:
                    row_reasoning_scope += 1
            else:
                s = self.get_action(step)
                if s:
                    action_scope += 1
                    if not s['errmsg']:
                        code_scope += 1
                        # whether action result is correct
                        reasoning_scope += self.soft_equal(pred, refer, s)

        result = dict(
            follow_acc=100 * (row_reasoning_scope + final_scope) / total,
            reasoning_acc=100 *
            (reasoning_scope + final_scope + row_reasoning_scope) / total,
            code_acc=100 * (code_scope + final_scope) / total,
            action_acc=100 * (action_scope + final_scope) / total,
        )
        return result
...@@ -310,3 +310,65 @@ class MATHEvaluator(BaseEvaluator): ...@@ -310,3 +310,65 @@ class MATHEvaluator(BaseEvaluator):
return ss1 == ss2 return ss1 == ss2
except: # noqa except: # noqa
return str1 == str2 return str1 == str2
class MATHAgentEvaluator(MATHEvaluator):
    """math agent evaluator for soft condition.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def soft_equal(self, pred, refer, step):
        """Return True when the action's raw result text is equivalent to
        the reference (via inherited ``is_equiv``); False otherwise."""
        # Pre-bind so the diagnostic print below cannot itself raise
        # NameError when step['result'] is None/missing (previous bug:
        # `soft_pred` was referenced unbound inside the except branch).
        soft_pred = None
        try:
            soft_pred = step['result']['text']
            if self.is_equiv(soft_pred, refer):
                return True
        except Exception:
            # result might not exists
            print(pred, soft_pred, refer)
        return False

    def get_action(self, step):
        """Return the latest action of type ``self.action``, or None."""
        for s in step[::-1]:
            if s['type'] == self.action:
                return s

    def score(self, predictions, references, steps):
        """Calculate accuracy.

        Same bucketing as Gsm8kAgentEvaluator, except ``code_acc`` is
        normalized over tool-using samples rather than all samples.
        """
        row_reasoning_scope = 0  # right answer without any tool use
        action_scope = 0         # wrong answer but tool was used
        code_scope = 0           # ... and the code ran cleanly
        reasoning_scope = 0      # ... and the tool result was right anyway
        final_scope = 0          # right answer with tool use
        total = len(references)
        for pred, refer, step in zip(predictions, references, steps):
            # if final answer right
            if self.is_equiv(pred, refer):
                if self.get_action(step):
                    final_scope += 1
                else:
                    row_reasoning_scope += 1
            else:
                s = self.get_action(step)
                if s:
                    action_scope += 1
                    if not s['errmsg']:
                        code_scope += 1
                        # whether action result is correct
                        reasoning_scope += self.soft_equal(pred, refer, s)

        # Guard the tool-usage denominator: previously this raised
        # ZeroDivisionError when no sample ever used the interpreter and
        # none was answered correctly with it. Report 0.0 in that case.
        used = action_scope + final_scope
        result = dict(
            follow_acc=100 * (row_reasoning_scope + final_scope) / total,
            reasoning_acc=100 *
            (reasoning_scope + final_scope + row_reasoning_scope) / total,
            code_acc=(100 * (code_scope + final_scope) / used) if used else 0.0,
            action_pct=100 * used / total,
        )
        return result
import base64
import io
import logging
import os
import queue
import re
import signal
import sys
import traceback
import uuid
from typing import Optional, Tuple
import json5
import PIL.Image
from jupyter_client import KernelManager
from lagent.actions.base_action import BaseAction
from lagent.schema import ActionReturn, ActionStatusCode
WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
DEFAULT_DESCRIPTION = """启动Jupter Kernel用于执行Python代码。"""
START_CODE = """
def input(*args, **kwargs):
raise NotImplementedError('Python input() function is disabled.')
get_ipython().system = lambda *args: print('Assume we have this package, ! is disabled!')
{}
""" # noqa
class TimeoutError(Exception):
    """Raised when a script exceeds the execution time limit.

    NOTE(review): shadows the builtin TimeoutError; harmless here since it
    is only raised and caught locally, but consider renaming.
    """
    pass
class IPythonInterpreter(BaseAction):
    """A IPython executor that can execute Python scripts in a jupyter manner.

    Args:
        description (str): The description of the action. Defaults to
            DEFAULT_DESCRIPTION.
        name (str, optional): The name of the action. If None, the name will
            be the class name. Defaults to None.
        enable (bool, optional): Whether the action is enabled. Defaults to
            True.
        disable_description (str, optional): The description of the action when
            it is disabled. Defaults to None.
        timeout (int): Upper bound of waiting time for Python script execution.
            Defaults to 20.
        user_data_dir (str): Specified the user data directory for files
            loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
            Defaults to `ENV`.
    """

    # One (KernelManager, KernelClient) pair per process id, shared by all
    # interpreter instances within that process.
    _KERNEL_CLIENTS = {}

    def __init__(self,
                 description: str = DEFAULT_DESCRIPTION,
                 name: Optional[str] = None,
                 enable: bool = True,
                 disable_description: Optional[str] = None,
                 timeout: int = 20,
                 user_data_dir: str = 'ENV') -> None:
        super().__init__(description, name, enable, disable_description)

        self.timeout = timeout
        if user_data_dir == 'ENV':
            user_data_dir = os.environ.get('USER_DATA_DIR', '')
        if user_data_dir:
            # The kernel chdirs into the parent of the user data dir so that
            # relative paths in generated code resolve against it.
            user_data_dir = os.path.dirname(user_data_dir)
            user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
        self.user_data_dir = user_data_dir
        # Kernel startup is deferred until the first call.
        self._initialized = False

    @staticmethod
    def start_kernel():
        """Start a fresh Jupyter kernel and return its (manager, client)."""
        # start the kernel and manager
        km = KernelManager()
        km.start_kernel()
        kc = km.client()
        return km, kc

    def initialize(self):
        """Lazily attach to (or start) the per-process kernel and run the
        bootstrap START_CODE once."""
        if self._initialized:
            return
        pid = os.getpid()
        if pid not in self._KERNEL_CLIENTS:
            self._KERNEL_CLIENTS[pid] = self.start_kernel()
        self.kernel_manager, self.kernel_client = self._KERNEL_CLIENTS[pid]
        self._initialized = True
        self._call(START_CODE.format(self.user_data_dir), None)

    def reset(self):
        """Clear the kernel namespace (IPython %reset -f) and re-run the
        bootstrap code; starts the kernel if not yet initialized."""
        if not self._initialized:
            self.initialize()
        else:
            code = "get_ipython().run_line_magic('reset', '-f')\n" + \
                START_CODE.format(self.user_data_dir)
            self._call(code, None)

    def _call(self,
              command: str,
              timeout: Optional[int] = None) -> Tuple[str, bool]:
        """Execute ``command`` in the kernel and collect its output.

        Returns (succeed, result): result is a markdown-formatted string of
        fenced outputs and image links; succeed is False on any kernel
        error or timeout.
        """
        self.initialize()
        command = extract_code(command)

        # check previous remaining result: drain stale iopub messages left
        # by an earlier execution so they don't pollute this one.
        while True:
            try:
                msg = self.kernel_client.get_iopub_msg(timeout=1)
                msg_type = msg['msg_type']
                if msg_type == 'status':
                    if msg['content'].get('execution_state') == 'idle':
                        break
            except queue.Empty:
                # assume no result
                break

        self.kernel_client.execute(command)

        def _inner_call():
            # Pump the iopub channel until the kernel reports idle,
            # accumulating text and image outputs in markdown form.
            result = ''
            succeed = True
            image_idx = 0

            while True:
                text = ''
                image = ''
                finished = False
                msg_type = 'error'
                try:
                    msg = self.kernel_client.get_iopub_msg(timeout=10)
                    msg_type = msg['msg_type']
                    if msg_type == 'status':
                        # 'idle' marks the end of this execution.
                        if msg['content'].get('execution_state') == 'idle':
                            finished = True
                    elif msg_type == 'execute_result':
                        text = msg['content']['data'].get('text/plain', '')
                        if 'image/png' in msg['content']['data']:
                            image_b64 = msg['content']['data']['image/png']
                            image_url = publish_image_to_local(image_b64)
                            image_idx += 1
                            image = '![fig-%03d](%s)' % (image_idx, image_url)
                    elif msg_type == 'display_data':
                        if 'image/png' in msg['content']['data']:
                            image_b64 = msg['content']['data']['image/png']
                            image_url = publish_image_to_local(image_b64)
                            image_idx += 1
                            image = '![fig-%03d](%s)' % (image_idx, image_url)
                        else:
                            text = msg['content']['data'].get('text/plain', '')
                    elif msg_type == 'stream':
                        msg_type = msg['content']['name']  # stdout, stderr
                        text = msg['content']['text']
                    elif msg_type == 'error':
                        succeed = False
                        text = escape_ansi('\n'.join(
                            msg['content']['traceback']))
                        if 'M6_CODE_INTERPRETER_TIMEOUT' in text:
                            text = f'Timeout. No response after {timeout} seconds.'  # noqa
                except queue.Empty:
                    # stop current task in case break next input.
                    self.kernel_manager.interrupt_kernel()
                    succeed = False
                    text = f'Timeout. No response after {timeout} seconds.'
                    finished = True
                except Exception:
                    succeed = False
                    text = 'The code interpreter encountered an unexpected error.'  # noqa
                    logging.warning(''.join(
                        traceback.format_exception(*sys.exc_info())))
                    finished = True
                if text:
                    result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
                if image:
                    result += f'\n\n{image}'
                if finished:
                    return succeed, result

        try:
            if timeout:
                # SIGALRM enforces a hard wall-clock limit around the whole
                # message-pumping loop (main thread / POSIX only).
                def handler(signum, frame):
                    raise TimeoutError()

                signal.signal(signal.SIGALRM, handler)
                signal.alarm(timeout)
            succeed, result = _inner_call()
        except TimeoutError:
            succeed = False
            text = 'The code interpreter encountered an unexpected error.'
            result = f'\n\nerror:\n\n```\n{text}\n```'
        finally:
            if timeout:
                # Always cancel the pending alarm.
                signal.alarm(0)
        result = result.lstrip('\n')
        return succeed, result

    def __call__(self,
                 command: str,
                 timeout: Optional[int] = None) -> ActionReturn:
        """Run ``command`` and wrap the outcome in an ActionReturn."""
        tool_return = ActionReturn(url=None, args=None, type=self.name)
        tool_return.args = dict(text=command)
        succeed, result = self._call(command, timeout)
        if succeed:
            tool_return.result = dict(text=result)
            tool_return.state = ActionStatusCode.SUCCESS
        else:
            tool_return.errmsg = repr(result)
            tool_return.state = ActionStatusCode.API_ERROR
        return tool_return
def extract_code(text):
    """Extract the code payload from a model response.

    Preference order: fenced ``` block, then inline `backtick` span, then a
    JSON5 object with a 'code' field; otherwise the text is returned as-is.
    """
    # Match triple backtick blocks first
    fenced = re.search(r'```[^\n]*\n(.+?)```', text, re.DOTALL)
    if fenced:
        return fenced.group(1)

    # Match single backtick blocks second
    inline = re.search(r'`([^`]*)`', text, re.DOTALL)
    if inline:
        return inline.group(1)

    # Finally, try a JSON5 payload such as {"code": "..."}; on any parse
    # failure fall through to the original text unchanged.
    try:
        return json5.loads(text)['code']
    except Exception:
        return text
def escape_ansi(line):
    """Strip ANSI terminal escape sequences (colors, cursor moves) from
    *line* — used to clean kernel tracebacks before display."""
    return re.sub(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]', '', line)
def publish_image_to_local(image_base64: str):
    """Decode a base64-encoded PNG and save it under WORK_DIR.

    The file gets a random UUID-based name; the local path is returned.
    """
    raw = base64.b64decode(image_base64)
    assert isinstance(raw, bytes)
    out_path = os.path.join(WORK_DIR, f'{uuid.uuid4()}.png')
    PIL.Image.open(io.BytesIO(raw)).save(out_path, 'png')
    return out_path
# local test for code interpreter
def get_multiline_input(hint):
    """Read lines from stdin until EOF (CTRL-D), joined with newlines.

    Prints *hint* plus usage instructions first; returns '' when no line
    was entered.
    """
    print(hint)
    print('// Press ENTER to make a new line. Press CTRL-D to end input.')
    collected = []
    while True:
        try:
            collected.append(input())
        except EOFError:  # CTRL-D terminates input
            break
    print('// Input received.')
    if collected:
        return '\n'.join(collected)
    return ''
if __name__ == '__main__':
    # Manual smoke test: read multi-line snippets from stdin and print the
    # interpreter's formatted result. Loops until the process is interrupted.
    code_interpreter = IPythonInterpreter()
    while True:
        print(code_interpreter(get_multiline_input('Enter python code:')))
import copy
import io
import signal
from contextlib import redirect_stdout
from typing import Any, Optional
from lagent.actions.base_action import BaseAction
from lagent.schema import ActionReturn, ActionStatusCode
class TimeoutError(Exception):
    """Raised by the SIGALRM handler when script execution exceeds the limit.

    NOTE(review): this shadows the builtin ``TimeoutError`` inside this
    module — confirm that is intentional before relying on ``except``
    clauses elsewhere.
    """
    pass
def handler(signum, frame):
    """SIGALRM handler: abort the running script by raising TimeoutError."""
    raise TimeoutError
class GenericRuntime:
    """A persistent, unsandboxed namespace for executing code snippets.

    Subclasses may pre-populate globals (``GLOBAL_DICT``), locals
    (``LOCAL_DICT``) and run setup snippets (``HEADERS``) at creation time.
    State survives across ``exec_code``/``eval_code`` calls.
    """
    GLOBAL_DICT = {}
    LOCAL_DICT = None
    HEADERS = []

    def __init__(self):
        self._global_vars = copy.copy(self.GLOBAL_DICT)
        if self.LOCAL_DICT:
            self._local_vars = copy.copy(self.LOCAL_DICT)
        else:
            self._local_vars = None
        # Run any class-level setup snippets (imports, helpers, ...).
        for header in self.HEADERS:
            self.exec_code(header)

    def exec_code(self, code_piece: str) -> None:
        """Execute *code_piece* inside the persistent global namespace."""
        # NOTE(review): plain exec on model-generated code — no sandboxing.
        exec(code_piece, self._global_vars)

    def eval_code(self, expr: str) -> Any:
        """Evaluate *expr* against the persistent global namespace."""
        return eval(expr, self._global_vars)
DEFAULT_DESCRIPTION = """用来执行Python代码。代码必须是一个函数,
函数名必须得是 'solution',代码对应你的思考过程。代码实例格式如下:
```python
# import 依赖包
import xxx
def solution():
# 初始化一些变量
variable_names_with_real_meaning = xxx
# 步骤一
mid_variable = func(variable_names_with_real_meaning)
# 步骤 x
mid_variable = func(mid_variable)
# 最后结果
final_answer = func(mid_variable)
return final_answer
```"""
class PythonInterpreter(BaseAction):
    """A Python executor that can execute Python scripts.

    Args:
        description (str): The description of the action. Defaults to
            DEFAULT_DESCRIPTION.
        answer_symbol (str, Optional): name of a variable read from the
            script's global namespace after execution to obtain the answer.
        answer_expr (str, Optional): expression evaluated after the script
            runs to obtain the answer. Defaults to 'solution()'.
        answer_from_stdout (bool): whether the execution result is taken
            from captured stdout instead of a symbol/expression.
        name (str, optional): The name of the action. If None, the name will
            be the class name. Defaults to None.
        enable (bool, optional): Whether the action is enabled. Defaults to
            True.
        disable_description (str, optional): The description of the action when
            it is disabled. Defaults to None.
        timeout (int): Upper bound of waiting time for Python script
            execution, in seconds (enforced via SIGALRM).
    """

    def __init__(self,
                 description: str = DEFAULT_DESCRIPTION,
                 answer_symbol: Optional[str] = None,
                 answer_expr: Optional[str] = 'solution()',
                 answer_from_stdout: bool = False,
                 name: Optional[str] = None,
                 enable: bool = True,
                 disable_description: Optional[str] = None,
                 timeout: int = 20) -> None:
        super().__init__(description, name, enable, disable_description)
        self.answer_symbol = answer_symbol
        self.answer_expr = answer_expr
        self.answer_from_stdout = answer_from_stdout
        self.timeout = timeout

    def __call__(self, command: str) -> ActionReturn:
        # Fresh runtime per call: no state leaks between commands.
        self.runtime = GenericRuntime()
        # SIGALRM-based timeout — NOTE(review): only effective in the main
        # thread on POSIX platforms.
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(self.timeout)
        try:
            tool_return = self._call(command)
        except TimeoutError as e:
            tool_return = ActionReturn(url=None, args=None, type=self.name)
            tool_return.errmsg = repr(e)
            tool_return.state = ActionStatusCode.API_ERROR
        finally:
            # Always cancel the pending alarm so it cannot fire later.
            signal.alarm(0)
        return tool_return

    def _call(self, command: str) -> ActionReturn:
        tool_return = ActionReturn(url=None, args=None, type=self.name)
        try:
            # Strip a markdown code fence if the LLM wrapped the code in one.
            if '```python' in command:
                command = command.split('```python')[1].split('```')[0]
            elif '```' in command:
                command = command.split('```')[1].split('```')[0]
            tool_return.args = dict(text='```python\n' + command + '\n```')
            command = command.split('\n')
            if self.answer_from_stdout:
                # Capture stdout; the last printed line is the result.
                program_io = io.StringIO()
                with redirect_stdout(program_io):
                    self.runtime.exec_code('\n'.join(command))
                program_io.seek(0)
                res = program_io.readlines()[-1]
            elif self.answer_symbol:
                # Read the result from a named variable in the script globals.
                self.runtime.exec_code('\n'.join(command))
                res = self.runtime._global_vars[self.answer_symbol]
            elif self.answer_expr:
                # Evaluate an expression (e.g. 'solution()') for the result.
                self.runtime.exec_code('\n'.join(command))
                res = self.runtime.eval_code(self.answer_expr)
            else:
                # No result channel configured: run all but the last line
                # and report bare success.
                self.runtime.exec_code('\n'.join(command[:-1]))
                res = True
        except Exception as e:
            tool_return.errmsg = repr(e)
            tool_return.type = self.name
            tool_return.state = ActionStatusCode.API_ERROR
            return tool_return
        try:
            # str(res) may itself raise for exotic objects; treat that as
            # an execution failure too.
            tool_return.result = dict(text=str(res))
            tool_return.state = ActionStatusCode.SUCCESS
        except Exception as e:
            tool_return.errmsg = repr(e)
            tool_return.type = self.name
            tool_return.state = ActionStatusCode.API_ERROR
        return tool_return
import re
from typing import Union from typing import Union
from lagent.actions import ActionExecutor from lagent.actions import ActionExecutor
...@@ -5,7 +6,7 @@ from lagent.agents.base_agent import BaseAgent ...@@ -5,7 +6,7 @@ from lagent.agents.base_agent import BaseAgent
from lagent.agents.react import ReActProtocol from lagent.agents.react import ReActProtocol
from lagent.llms.base_api import BaseAPIModel from lagent.llms.base_api import BaseAPIModel
from lagent.llms.base_llm import BaseModel from lagent.llms.base_llm import BaseModel
from lagent.schema import ActionReturn, AgentReturn from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn
class ReAct(BaseAgent): class ReAct(BaseAgent):
...@@ -32,15 +33,54 @@ class ReAct(BaseAgent): ...@@ -32,15 +33,54 @@ class ReAct(BaseAgent):
action_executor=action_executor, action_executor=action_executor,
protocol=protocol) protocol=protocol)
def reset(self):
"""Reset history."""
self._session_history = []
def opencompass_adapter(self, prompt): def opencompass_adapter(self, prompt):
# adapter for prompt parsing # adapter for prompt parsing
from opencompass.utils.prompt import PromptList
if isinstance(prompt, list): if isinstance(prompt, list):
system_prompt = []
merged_prompt = []
for p in prompt: for p in prompt:
if 'content' in p: tmp_p = p.copy()
p['prompt'] = p.pop('content') if 'content' in tmp_p:
prompt = PromptList(prompt) tmp_p['prompt'] = tmp_p.pop('content')
return prompt if 'role' in tmp_p:
if tmp_p['role'] == 'system':
# skip system prompt
system_prompt.append(tmp_p['prompt'])
continue
# no system for meta template temperaily
if tmp_p['role'] == 'assistant':
tmp_p['role'] = 'BOT'
if tmp_p['role'] == 'user':
# merge previous system prompt to user
system_str = ''.join(system_prompt)
tmp_p['prompt'] = system_str + tmp_p['prompt']
tmp_p['role'] = 'HUMAN'
system_prompt = []
merged_prompt.append(tmp_p)
# merge if system still exists
if system_prompt:
if 'role' in merged_prompt[-1]:
if merged_prompt[-1]['role'] == 'HUMAN':
# append to the final human prompt
merged_prompt[-1]['prompt'] += ''.join(system_prompt)
else:
# create a human prompt behind
merged_prompt.append(
dict(role='HUMAN', prompt=''.join(system_prompt)))
from opencompass.utils.prompt import PromptList
new_prompt = PromptList()
# adapter for meta template
new_prompt.append(dict(section='round', pos='begin'))
new_prompt.extend(merged_prompt)
new_prompt.append(dict(section='round', pos='end'))
return new_prompt
def chat(self, message: str) -> AgentReturn: def chat(self, message: str) -> AgentReturn:
self._inner_history = [] self._inner_history = []
...@@ -61,6 +101,13 @@ class ReAct(BaseAgent): ...@@ -61,6 +101,13 @@ class ReAct(BaseAgent):
content=response)) content=response))
thought, action, action_input = self._protocol.parse( thought, action, action_input = self._protocol.parse(
response, self._action_executor) response, self._action_executor)
# TODO: hard code here
action_input = re.sub('<eoa>', '', action_input)
if 'tensorflow' in action_input:
# skip tensorflow currently
break
action_return: ActionReturn = self._action_executor( action_return: ActionReturn = self._action_executor(
action, action_input) action, action_input)
action_return.thought = thought action_return.thought = thought
...@@ -79,3 +126,74 @@ class ReAct(BaseAgent): ...@@ -79,3 +126,74 @@ class ReAct(BaseAgent):
self._session_history.append( self._session_history.append(
dict(role='assistant', content=agent_return.response)) dict(role='assistant', content=agent_return.response))
return agent_return return agent_return
class CIReAct(ReAct):
"""Code Interpreter version of ReAct. The success state is different from
ReAct.
Args:
llm (BaseModel or BaseAPIModel): a LLM service which can chat
and act as backend.
action_executor (ActionExecutor): an action executor to manage
all actions and their response.
protocol (ReActProtocol): a wrapper to generate prompt and
parse the response from LLM / actions.
max_turn (int): the maximum number of trails for LLM to generate
plans that can be successfully parsed by ReWOO protocol.
"""
def reset(self):
"""Reset history and reset action if suit the case."""
self._session_history = []
# hard code here
from opencompass.lagent.actions.ipython_interpreter import \
IPythonInterpreter
b = IPythonInterpreter()
b.reset()
def chat(self, message: str) -> AgentReturn:
self._inner_history = []
# append the user message for session history
self._session_history.append(dict(role='user', content=message))
agent_return = AgentReturn()
force_stop = False
default_response = '对不起,我无法回答你的问题'
for turn in range(self.max_turn):
prompt = self._protocol.format(
chat_history=self.session_history,
inner_step=self._inner_history,
action_executor=self._action_executor,
force_stop=force_stop)
prompt = self.opencompass_adapter(prompt)
# allow single generation
response = self._llm.generate_from_template([prompt], 512)[0]
self._inner_history.append(dict(role='assistant',
content=response))
thought, action, action_input = self._protocol.parse(
response, self._action_executor)
action_return: ActionReturn = self._action_executor(
action, action_input)
action_return.thought = thought
agent_return.actions.append(action_return)
if action_return.state == ActionStatusCode.SUCCESS:
# if success, stash model response and system response
self._session_history.append(
dict(role='assistant', content=action_return.args['text']))
self._session_history.append(
dict(
role='system',
content=self._protocol.format_response(action_return)))
agent_return.response = action_return.result['text']
return agent_return
elif action_return.type == self._action_executor.invalid_action.name: # noqa
action_return.errmsg = 'The action is invalid, please check the action name.' # noqa
self._inner_history.append(
dict(role='system',
content=self._protocol.format_response(action_return)))
if turn == self.max_turn - 1:
force_stop = True
agent_return.response = default_response
self._session_history.append(
dict(role='assistant', content=agent_return.response))
return agent_return
...@@ -14,12 +14,19 @@ class LagentAgent: ...@@ -14,12 +14,19 @@ class LagentAgent:
https://github.com/InternLM/lagent. https://github.com/InternLM/lagent.
""" """
def __init__(self, agent_type, llm, actions=None, protocol=None, **kwargs): def __init__(self,
agent_type,
llm,
actions=None,
protocol=None,
mutli_rounds=False,
**kwargs):
llm = REGISTRY.build(llm) llm = REGISTRY.build(llm)
agent_cfg = {'type': agent_type, 'llm': llm, **kwargs} agent_cfg = {'type': agent_type, 'llm': llm, **kwargs}
if actions is not None: if actions is not None:
from lagent.actions import ActionExecutor from lagent.actions import ActionExecutor
executor = ActionExecutor( executor = ActionExecutor(
[REGISTRY.build(action) for action in actions]) [REGISTRY.build(action) for action in actions])
agent_cfg['action_executor'] = executor agent_cfg['action_executor'] = executor
...@@ -28,6 +35,7 @@ class LagentAgent: ...@@ -28,6 +35,7 @@ class LagentAgent:
agent_cfg['protocol'] = protocol agent_cfg['protocol'] = protocol
self.agent = REGISTRY.build(agent_cfg) self.agent = REGISTRY.build(agent_cfg)
self.mutli_rounds = mutli_rounds
def add_example(self, example): def add_example(self, example):
# format example in protocol if needed # format example in protocol if needed
...@@ -39,10 +47,11 @@ class LagentAgent: ...@@ -39,10 +47,11 @@ class LagentAgent:
get_logger().warning('Protocal template does not have example' get_logger().warning('Protocal template does not have example'
' placeholder, please check your template.') ' placeholder, please check your template.')
def chat(self, user_input, ice=None) -> Tuple[str, List[dict]]: def one_round_chat(self, user_input, ice=None) -> Tuple[str, List[dict]]:
"""One round chat with agent."""
from lagent.schema import ActionReturn, AgentReturn from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input) generation: AgentReturn = self.agent.chat(user_input)
self.agent._session_history = [] # clear agent history
answer = generation.response answer = generation.response
steps = [] steps = []
...@@ -60,11 +69,26 @@ class LagentAgent: ...@@ -60,11 +69,26 @@ class LagentAgent:
)) ))
return answer, steps return answer, steps
def chat(self, user_input, ice=None) -> Tuple[str, List[dict]]:
"""Chat with agent."""
if self.mutli_rounds:
steps = []
for single_input in user_input:
answer, one_round_steps = self.one_round_chat(single_input)
steps.append(one_round_steps)
else:
answer, steps = self.one_round_chat(user_input)
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information.""" # noqa self.agent.reset() # clear agent history
return answer, steps
FORCE_STOP_PROMPT_EN = (
"""You should directly give results based on history information.""" # noqa
)
FEWSHOT_INSTRUCTION = """\ FEWSHOT_INSTRUCTION = """\
You are a assistant who can utilize external tools. You are an assistant who can utilize external tools.
{{tool_description}} {{tool_description}}
To use a tool, please use the following format: To use a tool, please use the following format:
``` ```
...@@ -82,14 +106,12 @@ please using the following format to reply: ...@@ -82,14 +106,12 @@ please using the following format to reply:
{{thought}} the thought process to get the final answer {{thought}} the thought process to get the final answer
{{finish}} final answer {{finish}} final answer
``` ```
Example:
{example} {example}
Begin! Begin!
""" # noqa """ # noqa
PYTHON_INTERPRETER_DESCRIPTION = '''\ PYTHON_INTERPRETER_DESCRIPTION = """\
It can run a Python code. The code must be a valid code that contains only python method, and the method' name must be 'solution' and returns a dict, which key is variable name. The libraries I recommend are sympy and scipy. the format is: It can run a Python code. The code must be a valid code that contains only python method, and the method' name must be 'solution' and returns a dict, which key is variable name. The libraries I recommend are sympy and scipy. the format is:
```python ```python
# import packages # import packages
...@@ -102,21 +124,28 @@ def solution(): ...@@ -102,21 +124,28 @@ def solution():
# final answer # final answer
final_answer = func(mid_variable) final_answer = func(mid_variable)
return final_answer return final_answer
```''' # noqa ```""" # noqa
class CodeAgent: class CodeAgent:
"""Agent wrapper for Lagent.""" """Code Agent wrapper for Lagent."""
def __new__(self, llm, **kwargs): def __new__(self, llm, **kwargs):
from lagent.actions import PythonInterpreter
from lagent.agents.react import ReActProtocol from lagent.agents.react import ReActProtocol
from opencompass.lagent.actions.python_interpreter import \
PythonInterpreter
mutli_rounds = kwargs.pop('mutli_rounds', False)
agent_type = kwargs.pop('agent_type', ReAct) agent_type = kwargs.pop('agent_type', ReAct)
max_turn = kwargs.pop('max_turn', 3) max_turn = kwargs.pop('max_turn', 3)
actions = kwargs.pop('actions', [ actions = kwargs.pop(
'actions',
[
dict(type=PythonInterpreter, dict(type=PythonInterpreter,
description=PYTHON_INTERPRETER_DESCRIPTION), description=PYTHON_INTERPRETER_DESCRIPTION),
]) ],
)
protocol = kwargs.pop( protocol = kwargs.pop(
'protocol', 'protocol',
dict( dict(
...@@ -124,10 +153,12 @@ class CodeAgent: ...@@ -124,10 +153,12 @@ class CodeAgent:
call_protocol=FEWSHOT_INSTRUCTION, call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN, force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'), finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)) ),
)
return LagentAgent(agent_type=agent_type, return LagentAgent(agent_type=agent_type,
llm=llm, llm=llm,
max_turn=max_turn, max_turn=max_turn,
actions=actions, actions=actions,
protocol=protocol, protocol=protocol,
mutli_rounds=mutli_rounds,
**kwargs) **kwargs)
...@@ -77,8 +77,13 @@ class AgentInferencer(BaseInferencer): ...@@ -77,8 +77,13 @@ class AgentInferencer(BaseInferencer):
logger.info('Starting inference process...') logger.info('Starting inference process...')
for idx, ice_indices in tqdm(enumerate(ice_idx_list[start:], start), for idx, ice_indices in tqdm(enumerate(ice_idx_list[start:], start),
disable=not self.is_main_process): disable=not self.is_main_process):
user_input = retriever.generate_prompt_for_generate_task( # TODO: This will break the Prompt template
idx, ice='', prompt_template=prompt_template) # get user input directly without formatting prompt
#
# user_input = retriever.generate_prompt_for_generate_task(
# idx, ice='', prompt_template=prompt_template)
user_input = retriever.dataset_reader.dataset['test'][
retriever.dataset_reader.input_columns[0]][idx]
gold = retriever.dataset_reader.dataset['test'][ gold = retriever.dataset_reader.dataset['test'][
retriever.dataset_reader.output_column][idx] retriever.dataset_reader.output_column][idx]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment