Unverified commit e78857ac authored by Hubert, committed by GitHub

[Sync] minor test (#683)

parent dd4318f6
...@@ -29,5 +29,6 @@ models = [ ...@@ -29,5 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=2, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<eoa>',
) )
] ]
...@@ -29,5 +29,6 @@ models = [ ...@@ -29,5 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
) )
] ]
...@@ -29,5 +29,6 @@ models = [ ...@@ -29,5 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
) )
] ]
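The `end_str='<eoa>'` entries above (and the `'<|im_end|>'` entries in the Qwen configs below) tell the runner where a chat model's answer ends. A minimal sketch of how such a terminator is typically applied to decoded text, mirroring the `token.split(self.end_str)[0]` logic in the HuggingFace wrapper later in this commit; the helper name is illustrative only:

```python
# Illustrative helper (not part of the commit): cut a decoded generation at the
# model's end-of-answer marker, keeping only the text before it.
def trim_at_end_str(decoded: str, end_str: str = '<eoa>') -> str:
    return decoded.split(end_str)[0] if end_str else decoded

print(trim_at_end_str('The answer is 42.<eoa>stray continuation'))
# -> The answer is 42.
```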
...@@ -22,12 +22,14 @@ models = [ ...@@ -22,12 +22,14 @@ models = [
padding_side='left', padding_side='left',
truncation_side='left', truncation_side='left',
trust_remote_code=True, trust_remote_code=True,
use_fast=False,), use_fast=False,
),
pad_token_id=151643, pad_token_id=151643,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
) )
] ]
...@@ -22,12 +22,14 @@ models = [ ...@@ -22,12 +22,14 @@ models = [
padding_side='left', padding_side='left',
truncation_side='left', truncation_side='left',
trust_remote_code=True, trust_remote_code=True,
use_fast=False,), use_fast=False,
),
pad_token_id=151643, pad_token_id=151643,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
) )
] ]
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
mathbench_summary_groups = [
{
'name': 'mathbench-college',
'subsets': [
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-cloze_en', 'accuracy'],
]
},
{
'name': 'mathbench-high',
'subsets': [
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
]
},
{
'name': 'mathbench-middle',
'subsets': [
['mathbench-middle-single_choice_cn', 'acc_1'],
]
},
{
'name': 'mathbench-primary',
'subsets': [
['mathbench-primary-cloze_cn', 'accuracy'],
]
},
{
'name': 'mathbench',
'subsets': [
'mathbench-college',
'mathbench-high',
'mathbench-middle',
'mathbench-primary',
],
},
{
'name': 'mathbench-college-circular',
'subsets': [
['mathbench-college-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-high-circular',
'subsets': [
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
]
},
{
'name': 'mathbench-middle-circular',
'subsets': [
['mathbench-middle-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-circular',
'subsets': [
'mathbench-college-circular',
'mathbench-high-circular',
'mathbench-middle-circular',
],
},
{
'name': 'mathbench-circular-and-cloze',
'subsets': [
'mathbench-high-circular',
'mathbench-middle-circular',
'mathbench-circular',
'mathbench-college-cloze_en',
'mathbench-primary-cloze_cn',
],
}
]
summarizer = dict(
dataset_abbrs=[
'######## GSM8K-Agent Accuracy ########', # category
['gsm8k-agent', 'follow_acc'],
['gsm8k-agent', 'reasoning_acc'],
['gsm8k-agent', 'code_acc'],
['gsm8k-agent', 'action_pct'],
'######## MATH-Agent Accuracy ########', # category
['math-agent', 'follow_acc'],
['math-agent', 'reasoning_acc'],
['math-agent', 'code_acc'],
['math-agent', 'action_pct'],
'######## MathBench-Agent Accuracy ########', # category
['mathbench-college-single_choice_cn-agent', 'acc_1'],
['mathbench-college-cloze_en-agent', 'accuracy'],
['mathbench-high-single_choice_cn-agent', 'acc_1'],
['mathbench-high-single_choice_en-agent', 'acc_1'],
['mathbench-middle-single_choice_cn-agent', 'acc_1'],
['mathbench-primary-cloze_cn-agent', 'accuracy'],
'######## MathBench-Agent CircularEval ########', # category
['mathbench-college-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_en-agent', 'perf_4'],
['mathbench-middle-single_choice_cn-agent', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
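The `summary_groups` line above gathers every `*_summary_groups` list defined earlier in the config module. A minimal standalone sketch of that collection idiom (the two sample groups are invented for illustration):

```python
# Two sample groups standing in for the real cibench/mathbench definitions.
cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]

# Same idiom as in the summarizer: concatenate every list whose name ends
# with `_summary_groups` into one list of group definitions.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in summary_groups])  # -> ['cibench', 'mathbench']
```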
...@@ -2,13 +2,15 @@ import json ...@@ -2,13 +2,15 @@ import json
import os import os
import os.path as osp import os.path as osp
import re import re
import subprocess
from collections import defaultdict
from typing import List, Optional from typing import List, Optional
import numpy as np import numpy as np
from datasets import Dataset from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset from .base import BaseDataset
...@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict: ...@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
with open(file, 'r') as f: with open(file, 'r') as f:
notebook = json.load(f) notebook = json.load(f)
example = notebook['cells'] example = notebook['cells']
metadata = notebook['metadata']
modules = metadata.get('modules', [])
if modules:
# these two annotations should have the same length
assert len(modules) == len(metadata.get('step_types'))
# reformat annotations
modules = [[_m.strip() for _m in _modules.split('&')]
for _modules in modules]
questions = [] questions = []
source_codes = []
outputs = [] outputs = []
tags = [] tags = []
for cell in example: for cell in example:
if cell['cell_type'] == 'markdown': if cell['cell_type'] == 'markdown':
text = ''.join(cell['source']) text = ''.join(cell['source']).strip()
if modules:
_modules = modules.pop(0)
text += f"Please use {' and '.join(_modules)} modules."
text = text.strip() + '\n'
# append the formatted text # append the formatted text
questions.append(text) questions.append(text)
elif cell['cell_type'] == 'code': elif cell['cell_type'] == 'code':
source_codes.append(''.join(cell['source']))
if cell['outputs'] and 'data' in cell['outputs'][-1]: if cell['outputs'] and 'data' in cell['outputs'][-1]:
if 'image/png' in cell['outputs'][-1]['data']: if 'image/png' in cell['outputs'][-1]['data']:
# skip vis temporarily due to lack of evaluation # skip vis temporarily due to lack of evaluation
...@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict: ...@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
outputs.append(''.join( outputs.append(''.join(
cell['outputs'][-1]['data']['text/plain'])) cell['outputs'][-1]['data']['text/plain']))
else: else:
tags.append('executable') tags.append('exec')
outputs.append(None) outputs.append(None)
return dict( return dict(
experiment=file, experiment=file,
questions=sum(([ questions=sum(([
dict(role='user', content=question), dict(role='user', content=question),
dict(role='assistant', content=output) dict(role='assistant', content=source_code)
] for question, output in zip(questions, outputs)), []), ] for question, source_code in zip(questions, source_codes)), []),
references=dict(outputs=outputs, tags=tags, experiment=file), references=dict(outputs=outputs,
tags=tags,
metadata=metadata,
experiment=file),
) )
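For reference, a hypothetical minimal notebook that the updated `load_experiment` would accept, inferred from the code above: `metadata.modules` and `metadata.step_types` must have equal length, and each `modules` entry may join several libraries with `&`. The cell contents are invented for illustration:

```python
example_notebook = {
    'metadata': {
        # equal-length annotations; '&' joins multiple required modules
        'modules': ['pandas', 'matplotlib & numpy'],
        'step_types': ['num', 'vis'],
    },
    'cells': [
        {'cell_type': 'markdown', 'source': ['Load the CSV file.']},
        {'cell_type': 'code', 'source': ["df = pd.read_csv('a.csv')"], 'outputs': []},
        {'cell_type': 'markdown', 'source': ['Plot column x.']},
        {'cell_type': 'code', 'source': ["df['x'].plot()"], 'outputs': []},
    ],
}
# Each markdown cell becomes a user turn ending with
# "Please use <modules> modules."; each code cell becomes the assistant turn.
```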
...@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset): ...@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
@staticmethod @staticmethod
def load(path: str): def load(path: str):
"""Load whole dataset.""" """Load whole dataset."""
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = [] data_list = []
for cwd, dirs, files in os.walk(path): for cwd, dirs, files in os.walk(path):
dirs.sort() dirs.sort()
...@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
"""Evaluator for CI dataset. """Evaluator for CI dataset.
Args: Args:
text_evaluator (optional, dict): The text evaluator for text result
comparison. Defaults to None, in which case Rouge is used.
Please note that an extra key `metric_name` should be set
to get the exact metric result, such as `rouge1`.
output_dir (optional, str): The directory to save experiment output_dir (optional, str): The directory to save experiment
files in a markdown or notebook format. files in a markdown or notebook format.
with_ipynb (bool): Whether to additionally convert the saved markdown into an executed ipynb.
Defaults to False.
user_data_dir (str): The directory to load local files. user_data_dir (str): The directory to load local files.
Defaults to 'ENV', which means use environment variable Defaults to 'ENV', which means use environment variable
`USER_DATA_DIR` to get the data dir. `USER_DATA_DIR` to get the data dir.
""" """
def __init__(self, def __init__(self,
text_evaluator: Optional[dict] = None,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
with_ipynb: bool = False,
user_data_dir: str = 'ENV') -> None: user_data_dir: str = 'ENV') -> None:
if text_evaluator is None:
from opencompass.openicl.icl_evaluator import RougeEvaluator
self.text_evaluator = ICL_EVALUATORS.build(
dict(type=RougeEvaluator))
self.text_eval_metric = 'rouge1'
else:
self.text_eval_metric = text_evaluator.pop('metric_name')
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
# TODO: should use work dir for this task. # TODO: should use work dir for this task.
self.output_dir = output_dir self.output_dir = output_dir
self.user_data_dir = self.check_user_data_dir(user_data_dir)
self.with_ipynb = with_ipynb
self.TAG_MAPPING = {
'exec': ('executable', self.valid_step),
'general': ('general_correct', self.correct_step),
'num': ('numeric_correct', self.correct_step),
'text': ('text_score', self.text_step),
'vis': ('vis_sim', self.vis_similarity_step),
}
def check_user_data_dir(self, user_data_dir):
if user_data_dir == 'ENV': if user_data_dir == 'ENV':
user_data_dir = os.environ.get('USER_DATA_DIR', '') user_data_dir = os.environ.get('USER_DATA_DIR', '')
self.user_data_dir = user_data_dir user_data_dir = user_data_dir.rstrip('/')
basename = osp.basename(user_data_dir)
if basename and basename != 'data':
user_data_dir = osp.join(user_data_dir, 'data')
assert osp.exists(user_data_dir), \
f'a subfolder named `data` should exist under {user_data_dir}.'
elif basename:
assert osp.exists(user_data_dir), \
f'{user_data_dir} does not exist.'
return user_data_dir
@staticmethod @staticmethod
def valid_step(step): def valid_step(step):
...@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
# Fall back to False # Fall back to False
return False return False
def text_step(self, step, target):
"""Whether the step output is correct."""
# Found the latest code interpreter to determine correct
for action in step[::-1]:
if action['type'] == 'IPythonInterpreter':
if action['result']:
try:
pred = action['result']['text']
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
out = match.group(1)
score = self.text_evaluator.score([out], [target])
return score[self.text_eval_metric] / 100
except Exception:
return False
# Fall back to False
return False
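A short sketch of the extraction step inside `text_step` above: the interpreter reply is expected to wrap its textual result in a fenced block, which is pulled out before being scored by the text evaluator (Rouge `rouge1` by default). The sample reply is invented:

```python
import re

pred = "Here is the result:\n```\nmean sepal length is 5.84\n```\n"
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
    out = match.group(1)
    print(out)  # -> mean sepal length is 5.84
    # `out` would then be passed to self.text_evaluator.score([out], [target])
    # and the chosen metric divided by 100.
```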
@staticmethod @staticmethod
def vis_similarity_step(step, target): def vis_similarity_step(step, target):
"""Whether the step output image has the same structure similarity with """Whether the step output image has the same structure similarity with
...@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
'the conversion processes.') 'the conversion processes.')
check_jupytext() check_jupytext()
p_list = []
from opencompass.lagent.actions.ipython_interpreter import extract_code from opencompass.lagent.actions.ipython_interpreter import extract_code
for idx, (example_origin_prompt, for idx, (example_origin_prompt,
example_steps) in enumerate(zip(origin_prompt, steps)): example_steps) in enumerate(zip(origin_prompt, steps)):
...@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
f.writelines(markdown_lines) f.writelines(markdown_lines)
# TODO: be careful for this # TODO: be careful for this
# The result might differ from the inference process,
# so please check it carefully.
# convert markdown to ipynb and execute with error tolerance # convert markdown to ipynb and execute with error tolerance
# subprocess.Popen( if self.with_ipynb:
# "jupytext --to ipynb --pipe-fmt ipynb " p = subprocess.Popen(
# "--pipe 'jupyter nbconvert --to ipynb --execute " 'jupytext --to ipynb --pipe-fmt ipynb '
# f"--allow-errors --stdin --stdout' {md_file}", "--pipe 'jupyter nbconvert --to ipynb --execute "
# shell=True) f"--allow-errors --stdin --stdout' {md_file}",
shell=True)
p_list.append(p)
# TODO: async wait
for p in p_list:
p.wait()
def set_data_dir(self, work_dir): def set_data_dir(self, work_dir):
"""Set work directory and link data files for save notebook results.""" """Set work directory and link data files for save notebook results."""
if self.user_data_dir: if self.user_data_dir:
if self.user_data_dir.endswith('/'): basename = osp.basename(self.user_data_dir)
basename = osp.basename(osp.split(self.user_data_dir)[0])
else:
basename = osp.basename(self.user_data_dir)
if not osp.exists(osp.join(self.output_dir, basename)): if not osp.exists(osp.join(self.output_dir, basename)):
os.symlink(self.user_data_dir, os.symlink(self.user_data_dir,
osp.join(self.output_dir, basename)) osp.join(self.output_dir, basename))
...@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
"""Change work directory and keep the symlink.""" """Change work directory and keep the symlink."""
os.chdir(work_dir) os.chdir(work_dir)
def single_exp(self, gold, steps):
tags = gold['tags']
outputs = gold['outputs']
metadata = gold['metadata']
hard_tags = metadata.get('step_types', [])
if hard_tags:
tags = hard_tags
# executable: exec succeed
# general_correct: general correct
# numeric_correct: numerical correct
# text_score: text score
# vis_sim: visual similarity
result = defaultdict(list)
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
# add missing metrics for better analysis if they are absent
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []
return result
def get_output_dir(self):
"""Get output dir from eval task.
Notice: the output dir should be in the format xxx/data.
All the needed data files should be placed under it.
"""
# hard hack for get output dir from eval task
if hasattr(self, '_out_dir') and self.output_dir is None:
self.output_dir = self._out_dir
def score(self, predictions: List, references: List, steps: List, def score(self, predictions: List, references: List, steps: List,
origin_prompt: List): origin_prompt: List):
"""Calculate accuracy.""" """Calculate accuracy."""
cwd = os.getcwd() cwd = os.getcwd()
self.get_output_dir()
if self.output_dir: if self.output_dir:
if not osp.exists(self.output_dir): if not osp.exists(self.output_dir):
os.makedirs(self.output_dir) os.makedirs(self.output_dir)
...@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
self.save_results(origin_prompt, steps) self.save_results(origin_prompt, steps)
self.unset_data_dir(cwd) self.unset_data_dir(cwd)
num_cells_list = [] total_results = defaultdict(float)
num_general_list = [] total_scores = defaultdict(float)
passed_list = [] total_nums = defaultdict(int)
correct_list = []
vis_list = []
for gold, single_steps in zip(references, steps): for gold, single_steps in zip(references, steps):
tags = gold['tags'] result = self.single_exp(gold, single_steps)
outputs = gold['outputs']
num_cells = len(tags)
num_general = sum([tag == 'general' for tag in tags])
passed = sum([self.valid_step(step) for step in single_steps])
correct = 0
vis_sim = []
for tag, step, output in zip(tags, single_steps, outputs):
if tag == 'general':
correct += self.correct_step(step, output)
elif tag == 'vis':
vis_sim.append(self.vis_similarity_step(step, output))
num_cells_list.append(num_cells)
num_general_list.append(num_general)
passed_list.append(passed)
correct_list.append(correct)
if vis_sim:
vis_list.append(sum(vis_sim) / len(vis_sim))
else:
vis_list.append(-1)
if len([v for v in vis_list if v >= 0]) > 0: for k, v in result.items():
visualize_similarity = sum([v for v in vis_list if v >= 0]) / len( total_scores[k] += sum(v)
[v for v in vis_list if v >= 0]) total_nums[k] += len(v)
else:
# not valid
visualize_similarity = -1
if sum(num_general_list) > 0: for k, v in total_scores.items():
general_accuracy = sum(correct_list) / sum(num_general_list) if total_nums[k] > 0:
else: total_results[k] = total_scores[k] / total_nums[k] * 100
# not valid else:
general_accuracy = -1 total_results[k] = -1
result = dict( return total_results
executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
general_accuracy=general_accuracy * 100,
visualize_similarity=visualize_similarity * 100,
num_cells_list=num_cells_list,
num_general_list=num_general_list,
passed_list=passed_list,
correct_list=correct_list,
vis_list=vis_list,
)
return result
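A compact sketch of the new aggregation in `score`: per-tag score lists from every experiment are pooled, converted to percentages, and metrics with no samples fall back to -1. The two result dicts below are invented stand-ins for `single_exp` outputs:

```python
from collections import defaultdict

total_scores = defaultdict(float)
total_nums = defaultdict(int)
for result in [
        {'executable': [True, True, False], 'numeric_correct': [True]},
        {'executable': [True], 'numeric_correct': [], 'vis_sim': []},
]:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)

total_results = {
    k: total_scores[k] / total_nums[k] * 100 if total_nums[k] > 0 else -1
    for k in total_scores
}
print(total_results)
# -> {'executable': 75.0, 'numeric_correct': 100.0, 'vis_sim': -1}
```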
...@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset): ...@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
line = json.loads(line) line = json.loads(line)
if line['label'] == '-':
continue
data.append(line) data.append(line)
return Dataset.from_list(data) return Dataset.from_list(data)
......
...@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str: ...@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
return text return text
@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
def ds1000_completion_postprocess(text: str) -> str:
text += '</code>'
match = re.search('(.*?)</code>', text, re.DOTALL)
if match:
text = match.group(1)
return text
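A worked example of the new `ds1000_completion_postprocess` above: appending `</code>` guarantees the regex always finds a terminator, so only the code before the first `</code>` survives, and completions without the tag pass through unchanged:

```python
import re

def ds1000_completion_postprocess(text: str) -> str:
    text += '</code>'
    match = re.search('(.*?)</code>', text, re.DOTALL)
    if match:
        text = match.group(1)
    return text

print(repr(ds1000_completion_postprocess('result = df.mean()\n</code>\nEND')))
# -> 'result = df.mean()\n'
print(repr(ds1000_completion_postprocess('result = df.mean()')))  # no tag at all
# -> 'result = df.mean()'
```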
@TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib') @TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
def ds1000_matplotlib_postprocess(text: str) -> str: def ds1000_matplotlib_postprocess(text: str) -> str:
text = ds1000_postprocess(text) text = ds1000_postprocess(text)
......
...@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator): ...@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
reasoning_acc=100 * reasoning_acc=100 *
(reasoning_scope + final_scope + row_reasoning_scope) / total, (reasoning_scope + final_scope + row_reasoning_scope) / total,
code_acc=100 * (code_scope + final_scope) / total, code_acc=100 * (code_scope + final_scope) / total,
action_acc=100 * (action_scope + final_scope) / total, action_pct=100 * (action_scope + final_scope) / total,
) )
return result return result
...@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset): ...@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC'] circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
data = [] data = []
with open(path, 'r') as infile: with open(path, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile): for id, line in enumerate(infile):
entry = json.loads(line) entry = json.loads(line)
if 'cloze' in name: if 'cloze' in name:
......
...@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset): ...@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
for line in f: for line in f:
line = json.loads(line) line = json.loads(line)
prompt = line['sentence'] prompt = line['sentence']
dataset_list.append({ continue_prompt = prompt.split('_')
'opt1': data_item = {
prompt.replace('_', line['option1']), 'opt1': prompt.replace('_', line['option1']),
'opt2': 'opt2': prompt.replace('_', line['option2']),
prompt.replace('_', line['option2']), 'answer': line['answer'],
'answer': 'cont': continue_prompt[1]
line['answer'] }
}) dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list) dataset_list = Dataset.from_list(dataset_list)
return dataset_list return dataset_list
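The refactored loader above also records a new `cont` field: the slice of the Winogrande sentence after the `_` blank. A tiny worked example (the sentence is a standard Winogrande-style item, not taken from the dataset file):

```python
prompt = 'The trophy does not fit in the suitcase because _ is too big.'
continue_prompt = prompt.split('_')
print(repr(continue_prompt[1]))            # -> ' is too big.'
print(prompt.replace('_', 'the trophy'))   # opt1-style full sentence
```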
...@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset): ...@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
prompt = line['sentence'] prompt = line['sentence']
answer = line['answer'] answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL' answer = ' AB'[int(answer)] if answer != '' else 'NULL'
dataset_list.append({ data_item = {
'opt1': 'opt1': prompt.replace('_', line['option1']),
prompt.replace('_', line['option1']), 'opt2': prompt.replace('_', line['option2']),
'opt2': 'answer': answer,
prompt.replace('_', line['option2']), }
'answer': dataset_list.append(data_item)
answer
})
dataset_list = Dataset.from_list(dataset_list) dataset_list = Dataset.from_list(dataset_list)
return dataset_list return dataset_list
...@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction): ...@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
it is disabled. Defaults to None. it is disabled. Defaults to None.
timeout (int): Upper bound of waiting time for Python script execution. timeout (int): Upper bound of waiting time for Python script execution.
Defaults to 20. Defaults to 20.
trim_output (int, optional): Maximum number of characters kept in the
ipython output. If None, no trimming is performed.
TODO: Notice that this is a character limit, not a token length; other
trim strategies might be added later. Defaults to 1024.
user_data_dir (str): Specified the user data directory for files user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable. loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`. Defaults to `ENV`.
...@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction): ...@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
enable: bool = True, enable: bool = True,
disable_description: Optional[str] = None, disable_description: Optional[str] = None,
timeout: int = 20, timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None: user_data_dir: str = 'ENV') -> None:
super().__init__(description, name, enable, disable_description) super().__init__(description, name, enable, disable_description)
...@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction): ...@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
user_data_dir = os.environ.get('USER_DATA_DIR', '') user_data_dir = os.environ.get('USER_DATA_DIR', '')
if user_data_dir: if user_data_dir:
user_data_dir = os.path.dirname(user_data_dir) # user_data_dir = os.path.dirname(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')" user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir self.user_data_dir = user_data_dir
self._initialized = False self._initialized = False
self.trim_output = trim_output
if not os.path.exists(WORK_DIR): if not os.path.exists(WORK_DIR):
os.mkdir(WORK_DIR) os.mkdir(WORK_DIR)
...@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction): ...@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
if image: if image:
result += f'\n\n{image}' result += f'\n\n{image}'
if finished: if finished:
# in case output text too long
# might need better design later
if self.trim_output and len(result) > self.trim_output:
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result return succeed, result
try: try:
...@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction): ...@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
command: str, command: str,
timeout: Optional[int] = None) -> ActionReturn: timeout: Optional[int] = None) -> ActionReturn:
tool_return = ActionReturn(url=None, args=None, type=self.name) tool_return = ActionReturn(url=None, args=None, type=self.name)
tool_return.args = dict(text=command) extracted_command = extract_code(command)
succeed, result = self._call(command, timeout) tool_return.args = dict(text=command, extract_code=extracted_command)
if succeed: if extracted_command:
tool_return.result = dict(text=result) succeed, result = self._call(extracted_command, timeout)
tool_return.state = ActionStatusCode.SUCCESS if succeed:
if not result:
result = 'The code ran successfully without any output.'
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
tool_return.state = ActionStatusCode.API_ERROR
else: else:
tool_return.errmsg = repr(result) tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
tool_return.state = ActionStatusCode.API_ERROR tool_return.state = ActionStatusCode.API_ERROR
return tool_return return tool_return
......
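A minimal sketch of the new `trim_output` behaviour added to `IPythonInterpreter` above: an over-long interpreter result keeps only head and tail halves around a `......` ellipsis (a character limit, not a token limit). The helper wrapper is illustrative only:

```python
def trim_result(result: str, trim_output: int = 1024) -> str:
    # mirror the truncation applied before a finished result is returned
    if trim_output and len(result) > trim_output:
        ellip = '......'
        half_len = int((trim_output - len(ellip)) / 2)
        result = result[:half_len] + ellip + result[-half_len:]
    return result

print(len(trim_result('x' * 3000)))  # -> 1024 (509 + 6 + 509 characters)
```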
...@@ -115,6 +115,20 @@ class BaseModel: ...@@ -115,6 +115,20 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl') inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length) return self.get_ppl(inputs, mask_length)
def get_loglikelihood_from_template(self,
templates: List[PromptType],
conts: List[str],
mask_length=None):
"""Get perplexity given a list of templates.
Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)
def generate_from_template(self, templates: List[PromptType], def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs): max_out_len: int, **kwargs):
"""Generate completion from a list of templates. """Generate completion from a list of templates.
......
import re import re
import sys import sys
import threading import threading
import time
import warnings import warnings
from abc import abstractmethod from abc import abstractmethod
from copy import deepcopy from copy import deepcopy
from queue import Queue
from time import sleep from time import sleep
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Tuple, Union
...@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel): ...@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
def __init__(self, def __init__(self,
path: str, path: str,
query_per_second: int = 1, query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2, retry: int = 2,
max_seq_len: int = 2048, max_seq_len: int = 2048,
meta_template: Optional[Dict] = None, meta_template: Optional[Dict] = None,
...@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel): ...@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
self.meta_template = meta_template self.meta_template = meta_template
self.retry = retry self.retry = retry
self.query_per_second = query_per_second self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second) self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
self.template_parser = APITemplateParser(meta_template) self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger() self.logger = get_logger()
self.generation_kwargs = generation_kwargs self.generation_kwargs = generation_kwargs
...@@ -422,10 +425,13 @@ class TokenBucket: ...@@ -422,10 +425,13 @@ class TokenBucket:
query_per_second (float): The rate of the token bucket. query_per_second (float): The rate of the token bucket.
""" """
def __init__(self, rate): def __init__(self, rate, verbose=False):
self._rate = rate self._rate = rate
self._tokens = threading.Semaphore(0) self._tokens = threading.Semaphore(0)
self.started = False self.started = False
self._request_queue = Queue()
self.logger = get_logger()
self.verbose = verbose
def _add_tokens(self): def _add_tokens(self):
"""Add tokens to the bucket.""" """Add tokens to the bucket."""
...@@ -440,3 +446,12 @@ class TokenBucket: ...@@ -440,3 +446,12 @@ class TokenBucket:
self.started = True self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start() threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire() self._tokens.acquire()
if self.verbose:
cur_time = time.time()
while not self._request_queue.empty():
if cur_time - self._request_queue.queue[0] > 60:
self._request_queue.get()
else:
break
self._request_queue.put(cur_time)
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
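A standalone sketch of the RPM bookkeeping that `rpm_verbose` enables in `TokenBucket.get_token` above: timestamps older than 60 seconds are evicted before the new request is recorded, so the queue size is the requests-per-minute figure that gets logged. The function wrapper is illustrative:

```python
import time
from queue import Queue

request_queue = Queue()

def record_and_count(now: float) -> int:
    # drop timestamps outside the 60-second window, then record this request
    while not request_queue.empty():
        if now - request_queue.queue[0] > 60:
            request_queue.get()
        else:
            break
    request_queue.put(now)
    return request_queue.qsize()

for _ in range(3):
    print('Current RPM', record_and_count(time.time()))  # prints 1, 2, 3
```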
...@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union ...@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
import numpy as np import numpy as np
import torch import torch
import transformers
from opencompass.models.base import BaseModel from opencompass.models.base import BaseModel
from opencompass.models.base_api import APITemplateParser from opencompass.models.base_api import APITemplateParser
...@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList ...@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str] PromptType = Union[PromptList, str]
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
batch_size: int,
):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence,
add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
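A hedged usage sketch of `MultiTokenEOSCriteria` with a plain `transformers` `generate()` call, which is what the wrapper's `_single_generate` does further down in this diff. `'gpt2'` and the stop words are placeholders, and the class definition from the hunk above is assumed to be in scope:

```python
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

input_ids = tokenizer('Question: 1 + 1 = ?\nAnswer:',
                      return_tensors='pt').input_ids
stop_words = ['\nQuestion:', tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
    MultiTokenEOSCriteria(seq, tokenizer, input_ids.shape[0])
    for seq in stop_words
])
outputs = model.generate(input_ids=input_ids,
                         max_new_tokens=32,
                         stopping_criteria=stopping_criteria)
print(tokenizer.decode(outputs[0][input_ids.shape[1]:]))
```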
@MODELS.register_module() @MODELS.register_module()
class HuggingFace(BaseModel): class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace models. """Model wrapper around HuggingFace models.
...@@ -194,7 +222,10 @@ class HuggingFace(BaseModel): ...@@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
self.model.config.eos_token_id = 2 self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int, def generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]: **kwargs) -> List[str]:
"""Generate results given a list of inputs. """Generate results given a list of inputs.
...@@ -212,9 +243,12 @@ class HuggingFace(BaseModel): ...@@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
max_out_len=max_out_len, max_out_len=max_out_len,
**generation_kwargs) **generation_kwargs)
else: else:
return sum((self._single_generate( return sum(
inputs=[input_], max_out_len=max_out_len, **generation_kwargs) (self._single_generate(inputs=[input_],
for input_ in inputs), []) max_out_len=max_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str], max_out_len: int, def _batch_generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]: **kwargs) -> List[str]:
...@@ -275,7 +309,10 @@ class HuggingFace(BaseModel): ...@@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
decodeds = [token.split(self.end_str)[0] for token in decodeds] decodeds = [token.split(self.end_str)[0] for token in decodeds]
return decodeds return decodeds
def _single_generate(self, inputs: List[str], max_out_len: int, def _single_generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]: **kwargs) -> List[str]:
"""Support for single prompt inference. """Support for single prompt inference.
...@@ -319,6 +356,19 @@ class HuggingFace(BaseModel): ...@@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len - max_length=self.max_seq_len -
max_out_len)['input_ids'] max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device) input_ids = torch.tensor(input_ids, device=self.model.device)
if stopping_criteria:
# Construct huggingface stopping criteria
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
*[
MultiTokenEOSCriteria(sequence, self.tokenizer,
input_ids.shape[0])
for sequence in stopping_criteria
],
])
kwargs['stopping_criteria'] = stopping_criteria
# To accommodate the PeftModel, parameters should be passed in # To accommodate the PeftModel, parameters should be passed in
# key-value format for generate. # key-value format for generate.
outputs = self.model.generate(input_ids=input_ids, outputs = self.model.generate(input_ids=input_ids,
...@@ -434,6 +484,71 @@ class HuggingFace(BaseModel): ...@@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss return ce_loss
def get_loglikelihood(
self,
inputs: List[str],
conts: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get loglikelihood scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
conts (List[str]): A list of continuation strings (the slice of each
input after its context).
mask_length is NOT supported yet.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of loglikelihood scores.
"""
assert mask_length is None, 'Not support mask_length yet.'
if self.batch_padding and len(inputs) > 1:
raise NotImplementedError('Batch padding is not supported yet.')
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return np.array([
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
for idx in range(len(inputs))
])
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
"""Get loglikelihood scores given input string and continuation string.
Args:
inputs (str): The full input string (context plus continuation).
conts (str): The continuation slice of the input.
Returns:
float: loglikelihood scores.
"""
input_ids = self.tokenizer(inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
context_ids = self.tokenizer(inputs.replace(conts, ''),
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
cont_ids = input_ids[len(context_ids):]
output = self.model(input_ids.unsqueeze(0))
logits = output['logits'][:, :-1]
logits = torch.nn.functional.log_softmax(logits, dim=-1)
contlen = cont_ids.shape[0]
logits = logits[:, -contlen:, :]
# Reducing the dimension will lead to a wrong outcome
logits_gather = torch.gather(
logits, 2,
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
# Answer: sum the log-likelihood of each token in the continuation
answer = float(logits_gather.detach().cpu().sum())
return answer
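A toy sketch of the gather-and-sum at the heart of `_get_loglikelihood` above: with a fake three-token vocabulary, the continuation score is simply the sum of the log-probabilities the model assigns to each continuation token given everything before it (the numbers are made up):

```python
import torch

logits = torch.tensor([[[2.0, 0.5, 0.1],    # predicts the token at position 1
                        [0.2, 3.0, 0.3]]])  # predicts the token at position 2
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
cont_ids = torch.tensor([0, 1])             # the two continuation tokens
gathered = torch.gather(log_probs, 2, cont_ids.unsqueeze(0).unsqueeze(-1))
print(float(gathered.sum()))                # log P(continuation | context)
```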
def get_token_len(self, prompt: str) -> int: def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings. """Get lengths of the tokenized strings.
...@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace): ...@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
'role': { 'role': {
'HUMAN': 'user', 'HUMAN': 'user',
'BOT': 'assistant', 'BOT': 'assistant',
'SYSTEM': 'system' 'SYSTEM': 'system',
}[item['role']] }[item['role'].upper()]
} }
history.append(msg) history.append(msg)
user_content = history[-1]['content'] user_content = history[-1]['content']
...@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace): ...@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
response, history = self.model.chat(self.tokenizer, response, history = self.model.chat(self.tokenizer,
user_content, user_content,
history=history) history=history)
# the response is sometimes a dict
if isinstance(response, dict):
response = response.get('content', '')
responses.append(response) responses.append(response)
except Exception: except Exception:
responses.append('') responses.append('')
......
...@@ -52,7 +52,7 @@ class LagentAgent: ...@@ -52,7 +52,7 @@ class LagentAgent:
def chat(self, def chat(self,
user_input: str, user_input: str,
history: List[dict] = None) -> Tuple[str, List[dict]]: history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
"""Chat with agent.""" """Chat with agent."""
if history: if history:
self.agent._session_history = history self.agent._session_history = history
...@@ -60,6 +60,7 @@ class LagentAgent: ...@@ -60,6 +60,7 @@ class LagentAgent:
from lagent.schema import ActionReturn, AgentReturn from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input) generation: AgentReturn = self.agent.chat(user_input)
inner_steps = generation.inner_steps
answer = generation.response answer = generation.response
steps = [] steps = []
...@@ -76,7 +77,7 @@ class LagentAgent: ...@@ -76,7 +77,7 @@ class LagentAgent:
valid=int(step.valid), valid=int(step.valid),
)) ))
return answer, steps return answer, steps, inner_steps
FORCE_STOP_PROMPT_EN = ( FORCE_STOP_PROMPT_EN = (
......
...@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel): ...@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
dialog = [] dialog = []
for item in input: for item in input:
msg = {'content': item['prompt']} msg = {'content': item['prompt']}
if item['role'] == 'HUMAN': if item['role'].upper() == 'HUMAN':
msg['role'] = 'user' msg['role'] = 'user'
elif item['role'] == 'BOT': elif item['role'].upper() == 'BOT':
msg['role'] = 'assistant' msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM': elif item['role'].upper() == 'SYSTEM':
msg['role'] = 'system' msg['role'] = 'system'
else:
raise ValueError(f'Unknown role: {item["role"]}')
dialog.append(msg) dialog.append(msg)
dialogs.append(dialog) dialogs.append(dialog)
......