Unverified Commit e78857ac authored by Hubert, committed by GitHub

[Sync] minor test (#683)

parent dd4318f6
......@@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<eoa>',
)
]
......@@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]
......@@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]
......@@ -22,12 +22,14 @@ models = [
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]
......@@ -22,12 +22,14 @@ models = [
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]
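The `end_str` values added above ('<eoa>', '<|im_end|>') are stop strings; judging from the HuggingFace wrapper change later in this diff (`decodeds = [token.split(self.end_str)[0] ...]`), they are used to cut the decoded text at the first stop marker. A minimal sketch of that truncation, not taken from this patch:

    # Hedged sketch: how a stop string such as '<eoa>' trims a decoded output.
    def truncate_at_end_str(decoded: str, end_str: str) -> str:
        """Keep only the text before the first occurrence of end_str."""
        return decoded.split(end_str)[0]

    print(truncate_at_end_str('The answer is 42.<eoa>extra tokens', '<eoa>'))
    # -> 'The answer is 42.'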
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
mathbench_summary_groups = [
{
'name': 'mathbench-college',
'subsets': [
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-cloze_en', 'accuracy'],
]
},
{
'name': 'mathbench-high',
'subsets': [
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
]
},
{
'name': 'mathbench-middle',
'subsets': [
['mathbench-middle-single_choice_cn', 'acc_1'],
]
},
{
'name': 'mathbench-primary',
'subsets': [
['mathbench-primary-cloze_cn', 'accuracy'],
]
},
{
'name': 'mathbench',
'subsets': [
'mathbench-college',
'mathbench-high',
'mathbench-middle',
'mathbench-primary',
],
},
{
'name': 'mathbench-college-circular',
'subsets': [
['mathbench-college-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-high-circular',
'subsets': [
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
]
},
{
'name': 'mathbench-middle-circular',
'subsets': [
['mathbench-middle-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-circular',
'subsets': [
'mathbench-college-circular',
'mathbench-high-circular',
'mathbench-middle-circular',
],
},
{
'name': 'mathbench-circular-and-cloze',
'subsets': [
'mathbench-high-circular',
'mathbench-middle-circular',
'mathbench-circular',
'mathbench-college-cloze_en',
'mathbench-primary-cloze_cn',
],
}
]
summarizer = dict(
dataset_abbrs=[
'######## GSM8K-Agent Accuracy ########', # category
['gsm8k-agent', 'follow_acc'],
['gsm8k-agent', 'reasoning_acc'],
['gsm8k-agent', 'code_acc'],
['gsm8k-agent', 'action_pct'],
'######## MATH-Agent Accuracy ########', # category
['math-agent', 'follow_acc'],
['math-agent', 'reasoning_acc'],
['math-agent', 'code_acc'],
['math-agent', 'action_pct'],
'######## MathBench-Agent Accuracy ########', # category
['mathbench-college-single_choice_cn-agent', 'acc_1'],
['mathbench-college-cloze_en-agent', 'accuracy'],
['mathbench-high-single_choice_cn-agent', 'acc_1'],
['mathbench-high-single_choice_en-agent', 'acc_1'],
['mathbench-middle-single_choice_cn-agent', 'acc_1'],
['mathbench-primary-cloze_cn-agent', 'accuracy'],
'######## MathBench-Agent CircularEval ########', # category
['mathbench-college-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_en-agent', 'perf_4'],
['mathbench-middle-single_choice_cn-agent', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
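The `summary_groups` line above collects every top-level variable whose name ends with `_summary_groups` and flattens them into one list. A small self-contained sketch of that idiom (the variable contents here are illustrative):

    # Each *_summary_groups variable is a list of group dicts; sum(..., [])
    # concatenates them. locals() at module level sees all of them.
    cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
    mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]

    summary_groups = sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
    print(len(summary_groups))  # -> 2 (both lists merged into one flat list)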
......@@ -2,13 +2,15 @@ import json
import os
import os.path as osp
import re
import subprocess
from collections import defaultdict
from typing import List, Optional
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
......@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
with open(file, 'r') as f:
notebook = json.load(f)
example = notebook['cells']
metadata = notebook['metadata']
modules = metadata.get('modules', [])
if modules:
# these two annotations should have the same length
assert len(modules) == len(metadata.get('step_types'))
# reformat annotations
modules = [[_m.strip() for _m in _modules.split('&')]
for _modules in modules]
questions = []
source_codes = []
outputs = []
tags = []
for cell in example:
if cell['cell_type'] == 'markdown':
text = ''.join(cell['source'])
text = ''.join(cell['source']).strip()
if modules:
_modules = modules.pop(0)
text += f"Please use {' and '.join(_modules)} modules."
text = text.strip() + '\n'
# append the formatted text
questions.append(text)
elif cell['cell_type'] == 'code':
source_codes.append(''.join(cell['source']))
if cell['outputs'] and 'data' in cell['outputs'][-1]:
if 'image/png' in cell['outputs'][-1]['data']:
# skip vis temporarily due to lack of evaluation
......@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
outputs.append(''.join(
cell['outputs'][-1]['data']['text/plain']))
else:
tags.append('executable')
tags.append('exec')
outputs.append(None)
return dict(
experiment=file,
questions=sum(([
dict(role='user', content=question),
dict(role='assistant', content=output)
] for question, output in zip(questions, outputs)), []),
references=dict(outputs=outputs, tags=tags, experiment=file),
dict(role='assistant', content=source_code)
] for question, source_code in zip(questions, source_codes)), []),
references=dict(outputs=outputs,
tags=tags,
metadata=metadata,
experiment=file),
)
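A tiny illustration (not from this patch) of the new `modules` metadata handling in `load_experiment`: each annotation is split on '&' and appended to the matching markdown question.

    # Hedged example of the reformatting above.
    modules = ['Pandas & Matplotlib', 'SciPy']
    modules = [[m.strip() for m in _modules.split('&')] for _modules in modules]

    text = 'Plot the monthly sales.'
    _modules = modules.pop(0)
    text += f" Please use {' and '.join(_modules)} modules."
    print(text.strip())
    # -> 'Plot the monthly sales. Please use Pandas and Matplotlib modules.'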
......@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
@staticmethod
def load(path: str):
"""Load whole dataset."""
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = []
for cwd, dirs, files in os.walk(path):
dirs.sort()
......@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
"""Evaluator for CI dataset.
Args:
text_evaluator (optional, dict): The text evaluator for text result
comparison. Defaults to None, which uses Rouge by default.
Please note that an extra key `metric_name` should be set
to get the exact metric result, such as `rouge1`.
output_dir (optional, str): The directory to save experiment
files in a markdown or notebook format.
with_ipynb (bool): Generate ipynb correspondingly.
Defaults to False.
user_data_dir (str): The directory to load local files.
Defaults to 'ENV', which means use environment variable
`USER_DATA_DIR` to get the data dir.
"""
def __init__(self,
text_evaluator: Optional[dict] = None,
output_dir: Optional[str] = None,
with_ipynb: bool = False,
user_data_dir: str = 'ENV') -> None:
if text_evaluator is None:
from opencompass.openicl.icl_evaluator import RougeEvaluator
self.text_evaluator = ICL_EVALUATORS.build(
dict(type=RougeEvaluator))
self.text_eval_metric = 'rouge1'
else:
self.text_eval_metric = text_evaluator.pop('metric_name')
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
# TODO: should use work dir for this task.
self.output_dir = output_dir
self.user_data_dir = self.check_user_data_dir(user_data_dir)
self.with_ipynb = with_ipynb
self.TAG_MAPPING = {
'exec': ('executable', self.valid_step),
'general': ('general_correct', self.correct_step),
'num': ('numeric_correct', self.correct_step),
'text': ('text_score', self.text_step),
'vis': ('vis_sim', self.vis_similarity_step),
}
def check_user_data_dir(self, user_data_dir):
if user_data_dir == 'ENV':
user_data_dir = os.environ.get('USER_DATA_DIR', '')
self.user_data_dir = user_data_dir
user_data_dir = user_data_dir.rstrip('/')
basename = osp.basename(user_data_dir)
if basename and basename != 'data':
user_data_dir = osp.join(user_data_dir, 'data')
assert osp.exists(user_data_dir), \
f'a subfolder named `data` should exist under {user_data_dir}.'
elif basename:
assert osp.exists(user_data_dir), \
f'{user_data_dir} does not exist.'
return user_data_dir
@staticmethod
def valid_step(step):
......@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
# Fall back to False
return False
def text_step(self, step, target):
"""Whether the step output is correct."""
# Find the latest code interpreter action to determine correctness
for action in step[::-1]:
if action['type'] == 'IPythonInterpreter':
if action['result']:
try:
pred = action['result']['text']
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
out = match.group(1)
score = self.text_evaluator.score([out], [target])
return score[self.text_eval_metric] / 100
except Exception:
return False
# Fall back to False
return False
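A standalone sketch of the extraction step used by `text_step` above: the interpreter result is expected to wrap its text in a fenced block, and only the fenced body is scored against the reference (the Rouge call itself is omitted here).

    import re

    # Hedged sketch: pull the fenced text out of an interpreter result.
    pred = 'Out[3]:\n```\nThe mean value is 3.5\n```'
    match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
    if match:
        out = match.group(1)
        print(out)  # -> 'The mean value is 3.5'; `out` would then go to Rouge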
@staticmethod
def vis_similarity_step(step, target):
"""Whether the step output image has the same structure similarity with
......@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
'the conversion processes.')
check_jupytext()
p_list = []
from opencompass.lagent.actions.ipython_interpreter import extract_code
for idx, (example_origin_prompt,
example_steps) in enumerate(zip(origin_prompt, steps)):
......@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
f.writelines(markdown_lines)
# TODO: be careful with this
# The result might differ from the infer process,
# please check carefully
# convert markdown to ipynb and execute with error tolerance
# subprocess.Popen(
# "jupytext --to ipynb --pipe-fmt ipynb "
# "--pipe 'jupyter nbconvert --to ipynb --execute "
# f"--allow-errors --stdin --stdout' {md_file}",
# shell=True)
if self.with_ipynb:
p = subprocess.Popen(
'jupytext --to ipynb --pipe-fmt ipynb '
"--pipe 'jupyter nbconvert --to ipynb --execute "
f"--allow-errors --stdin --stdout' {md_file}",
shell=True)
p_list.append(p)
# TODO: async wait
for p in p_list:
p.wait()
def set_data_dir(self, work_dir):
"""Set work directory and link data files for save notebook results."""
if self.user_data_dir:
if self.user_data_dir.endswith('/'):
basename = osp.basename(osp.split(self.user_data_dir)[0])
else:
basename = osp.basename(self.user_data_dir)
basename = osp.basename(self.user_data_dir)
if not osp.exists(osp.join(self.output_dir, basename)):
os.symlink(self.user_data_dir,
osp.join(self.output_dir, basename))
......@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
"""Change work directory and keep the symlink."""
os.chdir(work_dir)
def single_exp(self, gold, steps):
tags = gold['tags']
outputs = gold['outputs']
metadata = gold['metadata']
hard_tags = metadata.get('step_types', [])
if hard_tags:
tags = hard_tags
# executable: exec succeed
# general_correct: general correct
# numeric_correct: numerical correct
# text_score: text score
# vis_sim: visual similarity
result = defaultdict(list)
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
# add missing metrics for better analysis if they do not exist
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []
return result
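A compact sketch (placeholder scorers, not the real ones) of the dispatch pattern `single_exp` uses via `TAG_MAPPING`: every step is checked for executability, and tagged steps are additionally scored by the registered function.

    from collections import defaultdict

    # Hedged sketch with dummy scoring functions.
    TAG_MAPPING = {
        'exec': ('executable', lambda step, out: True),
        'num': ('numeric_correct', lambda step, out: step == out),
        'vis': ('vis_sim', lambda step, out: 0.8),
    }
    tags, steps, outputs = ['exec', 'num', 'vis'], [None, 42, None], [None, 42, None]

    result = defaultdict(list)
    for tag, step, output in zip(tags, steps, outputs):
        result['executable'].append(True)  # valid_step(step) in the real code
        if tag != 'exec':
            key, func = TAG_MAPPING[tag]
            result[key].append(func(step, output))
    print(dict(result))
    # -> {'executable': [True, True, True], 'numeric_correct': [True], 'vis_sim': [0.8]}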
def get_output_dir(self):
"""Get output dir from eval task.
Notice: output dir should be in format xxx/data.
All the needed files should be
"""
# hard hack to get the output dir from the eval task
if hasattr(self, '_out_dir') and self.output_dir is None:
self.output_dir = self._out_dir
def score(self, predictions: List, references: List, steps: List,
origin_prompt: List):
"""Calculate accuracy."""
cwd = os.getcwd()
self.get_output_dir()
if self.output_dir:
if not osp.exists(self.output_dir):
os.makedirs(self.output_dir)
......@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
self.save_results(origin_prompt, steps)
self.unset_data_dir(cwd)
num_cells_list = []
num_general_list = []
passed_list = []
correct_list = []
vis_list = []
total_results = defaultdict(float)
total_scores = defaultdict(float)
total_nums = defaultdict(int)
for gold, single_steps in zip(references, steps):
tags = gold['tags']
outputs = gold['outputs']
num_cells = len(tags)
num_general = sum([tag == 'general' for tag in tags])
passed = sum([self.valid_step(step) for step in single_steps])
correct = 0
vis_sim = []
for tag, step, output in zip(tags, single_steps, outputs):
if tag == 'general':
correct += self.correct_step(step, output)
elif tag == 'vis':
vis_sim.append(self.vis_similarity_step(step, output))
num_cells_list.append(num_cells)
num_general_list.append(num_general)
passed_list.append(passed)
correct_list.append(correct)
if vis_sim:
vis_list.append(sum(vis_sim) / len(vis_sim))
else:
vis_list.append(-1)
result = self.single_exp(gold, single_steps)
if len([v for v in vis_list if v >= 0]) > 0:
visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
[v for v in vis_list if v >= 0])
else:
# not valid
visualize_similarity = -1
for k, v in result.items():
total_scores[k] += sum(v)
total_nums[k] += len(v)
if sum(num_general_list) > 0:
general_accuracy = sum(correct_list) / sum(num_general_list)
else:
# not valid
general_accuracy = -1
result = dict(
executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
general_accuracy=general_accuracy * 100,
visualize_similarity=visualize_similarity * 100,
num_cells_list=num_cells_list,
num_general_list=num_general_list,
passed_list=passed_list,
correct_list=correct_list,
vis_list=vis_list,
)
return result
for k, v in total_scores.items():
if total_nums[k] > 0:
total_results[k] = total_scores[k] / total_nums[k] * 100
else:
total_results[k] = -1
return total_results
......@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
if line['label'] == '-':
continue
data.append(line)
return Dataset.from_list(data)
......
......@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
return text
@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
def ds1000_completion_postprocess(text: str) -> str:
text += '</code>'
match = re.search('(.*?)</code>', text, re.DOTALL)
if match:
text = match.group(1)
return text
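A quick example (not from this patch) of `ds1000_completion_postprocess` on a typical completion: append `</code>` and keep everything before the first closing tag.

    import re

    text = "df['a'].sum()\n</code>\nEND SOLUTION"
    text += '</code>'
    match = re.search('(.*?)</code>', text, re.DOTALL)
    if match:
        text = match.group(1)
    print(repr(text))  # -> "df['a'].sum()\n"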
@TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
def ds1000_matplotlib_postprocess(text: str) -> str:
text = ds1000_postprocess(text)
......
......@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
reasoning_acc=100 *
(reasoning_scope + final_scope + row_reasoning_scope) / total,
code_acc=100 * (code_scope + final_scope) / total,
action_acc=100 * (action_scope + final_scope) / total,
action_pct=100 * (action_scope + final_scope) / total,
)
return result
......@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
data = []
with open(path, 'r') as infile:
with open(path, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile):
entry = json.loads(line)
if 'cloze' in name:
......
......@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
for line in f:
line = json.loads(line)
prompt = line['sentence']
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
line['answer']
})
continue_prompt = prompt.split('_')
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': line['answer'],
'cont': continue_prompt[1]
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list
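The new `cont` field keeps the text after the '_' placeholder. A one-line illustration with a made-up sentence:

    # Hedged example: the continuation is everything after the blank.
    prompt = 'The trophy does not fit in the suitcase because _ is too big.'
    continue_prompt = prompt.split('_')
    print(continue_prompt[1])  # -> ' is too big.'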
......@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
prompt = line['sentence']
answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
answer
})
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': answer,
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list
......@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
it is disabled. Defaults to None.
timeout (int): Upper bound of waiting time for Python script execution.
Defaults to 20.
trim_output (int, optional): Maximum number of characters kept from
ipython outputs. If None, no trimming is performed.
TODO: Note that this is not a token length, and more trim
strategies might be added later. Defaults to 1024.
user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`.
......@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
enable: bool = True,
disable_description: Optional[str] = None,
timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None:
super().__init__(description, name, enable, disable_description)
......@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
user_data_dir = os.environ.get('USER_DATA_DIR', '')
if user_data_dir:
user_data_dir = os.path.dirname(user_data_dir)
# user_data_dir = os.path.dirname(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir
self._initialized = False
self.trim_output = trim_output
if not os.path.exists(WORK_DIR):
os.mkdir(WORK_DIR)
......@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
if image:
result += f'\n\n{image}'
if finished:
# in case the output text is too long
# might need a better design later
if self.trim_output and len(result) > self.trim_output:
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result
try:
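A self-contained sketch of the trimming introduced above: keep the head and tail of an over-long interpreter output and join them with an ellipsis marker.

    # Hedged sketch of the trim logic with a small trim_output for illustration.
    trim_output = 24
    result = 'x' * 100
    if trim_output and len(result) > trim_output:
        ellip = '......'
        half_len = int((trim_output - len(ellip)) / 2)
        result = result[:half_len] + ellip + result[-half_len:]
    print(result)  # -> 'xxxxxxxxx......xxxxxxxxx' (24 characters)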
......@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
command: str,
timeout: Optional[int] = None) -> ActionReturn:
tool_return = ActionReturn(url=None, args=None, type=self.name)
tool_return.args = dict(text=command)
succeed, result = self._call(command, timeout)
if succeed:
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
extracted_command = extract_code(command)
tool_return.args = dict(text=command, extract_code=extracted_command)
if extracted_command:
succeed, result = self._call(extracted_command, timeout)
if succeed:
if not result:
result = 'The code ran successfully without any outputs.'
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
tool_return.state = ActionStatusCode.API_ERROR
else:
tool_return.errmsg = repr(result)
tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
tool_return.state = ActionStatusCode.API_ERROR
return tool_return
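`extract_code` is imported from opencompass.lagent.actions.ipython_interpreter and is not shown in this diff; the sketch below is only a hedged approximation of the behaviour this hunk relies on (pulling fenced code out of the raw action input) and may differ from the real implementation.

    import re

    # Hedged approximation; the real extract_code may handle more formats.
    def extract_code_sketch(command: str) -> str:
        """Return the body of the first fenced code block, or '' if none."""
        match = re.search(r'```(?:python)?\n(.*?)```', command, re.DOTALL)
        return match.group(1).strip() if match else ''

    print(extract_code_sketch('```python\nprint(1 + 1)\n```'))  # -> 'print(1 + 1)'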
......
......@@ -115,6 +115,20 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)
def get_loglikelihood_from_template(self,
templates: List[PromptType],
conts: List[str],
mask_length=None):
"""Get perplexity given a list of templates.
Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)
def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs):
"""Generate completion from a list of templates.
......
import re
import sys
import threading
import time
import warnings
from abc import abstractmethod
from copy import deepcopy
from queue import Queue
from time import sleep
from typing import Dict, List, Optional, Tuple, Union
......@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
def __init__(self,
path: str,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
......@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
self.meta_template = meta_template
self.retry = retry
self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second)
self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger()
self.generation_kwargs = generation_kwargs
......@@ -422,10 +425,13 @@ class TokenBucket:
query_per_second (float): The rate of the token bucket.
"""
def __init__(self, rate):
def __init__(self, rate, verbose=False):
self._rate = rate
self._tokens = threading.Semaphore(0)
self.started = False
self._request_queue = Queue()
self.logger = get_logger()
self.verbose = verbose
def _add_tokens(self):
"""Add tokens to the bucket."""
......@@ -440,3 +446,12 @@ class TokenBucket:
self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire()
if self.verbose:
cur_time = time.time()
while not self._request_queue.empty():
if cur_time - self._request_queue.queue[0] > 60:
self._request_queue.get()
else:
break
self._request_queue.put(cur_time)
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
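A minimal sketch (standalone, not this class) of the RPM bookkeeping added above: timestamps older than 60 seconds are dropped from the queue, so its size approximates requests per minute.

    import time
    from queue import Queue

    def record_request(q: Queue) -> int:
        """Record one request and return the approximate requests-per-minute."""
        cur_time = time.time()
        while not q.empty() and cur_time - q.queue[0] > 60:
            q.get()
        q.put(cur_time)
        return q.qsize()

    request_queue = Queue()
    print(record_request(request_queue))  # -> 1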
......@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
import numpy as np
import torch
import transformers
from opencompass.models.base import BaseModel
from opencompass.models.base_api import APITemplateParser
......@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
batch_size: int,
):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence,
add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last sequence_id_len tokens of each sample
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
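A hedged usage sketch of `MultiTokenEOSCriteria` (assuming a locally available 'gpt2' checkpoint; the actual wiring in this repo is the `_single_generate` change further down this diff):

    import transformers

    tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
    model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
    input_ids = tokenizer('Thought:', return_tensors='pt')['input_ids']

    # Stop generation once the stop sequence appears in the decoded tail.
    criteria = transformers.StoppingCriteriaList([
        MultiTokenEOSCriteria('\nObservation:', tokenizer, input_ids.shape[0]),
    ])
    outputs = model.generate(input_ids=input_ids,
                             max_new_tokens=32,
                             stopping_criteria=criteria)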
@MODELS.register_module()
class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace models.
......@@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int,
def generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
......@@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
max_out_len=max_out_len,
**generation_kwargs)
else:
return sum((self._single_generate(
inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
for input_ in inputs), [])
return sum(
(self._single_generate(inputs=[input_],
max_out_len=max_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]:
......@@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
decodeds = [token.split(self.end_str)[0] for token in decodeds]
return decodeds
def _single_generate(self, inputs: List[str], max_out_len: int,
def _single_generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for single prompt inference.
......@@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len -
max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
if stopping_criteria:
# Construct huggingface stopping criteria
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
*[
MultiTokenEOSCriteria(sequence, self.tokenizer,
input_ids.shape[0])
for sequence in stopping_criteria
],
])
kwargs['stopping_criteria'] = stopping_criteria
# To accommodate the PeftModel, parameters should be passed in
# key-value format for generate.
outputs = self.model.generate(input_ids=input_ids,
......@@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_loglikelihood(
self,
inputs: List[str],
conts: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get loglikelihood scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
conts (List[str]): A list of continuation strings (the slices
after the context). mask_length is NOT supported yet.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of loglikelihood scores.
"""
assert mask_length is None, 'Not support mask_length yet.'
if self.batch_padding and len(inputs) > 1:
raise NotImplementedError('Batch padding is not supported yet.')
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return np.array([
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
for idx in range(len(inputs))
])
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
"""Get loglikelihood scores given input string and continuation string.
Args:
inputs (str): The full input string.
conts (str): The continuation string (the slice after the context).
Returns:
float: loglikelihood scores.
"""
input_ids = self.tokenizer(inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
context_ids = self.tokenizer(inputs.replace(conts, ''),
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
cont_ids = input_ids[len(context_ids):]
output = self.model(input_ids.unsqueeze(0))
logits = output['logits'][:, :-1]
logits = torch.nn.functional.log_softmax(logits, dim=-1)
contlen = cont_ids.shape[0]
logits = logits[:, -contlen:, :]
# Reducing the dimension will lead to a wrong outcome
logits_gather = torch.gather(
logits, 2,
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
# Answer: sum the log-likelihood of each token in the continuation
answer = float(logits_gather.detach().cpu().sum())
return answer
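A toy illustration (random logits, made-up token ids) of the gather step in `_get_loglikelihood` above: per-token log-probabilities of the continuation are picked out of the vocabulary dimension and summed.

    import torch

    logits = torch.log_softmax(torch.randn(1, 3, 10), dim=-1)  # [batch, seq, vocab]
    cont_ids = torch.tensor([4, 7, 1])                          # continuation tokens
    gathered = torch.gather(logits, 2, cont_ids.unsqueeze(0).unsqueeze(-1))
    loglikelihood = float(gathered.sum())
    print(loglikelihood)  # sum of per-token log-probabilities (a negative number)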
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
......@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
'role': {
'HUMAN': 'user',
'BOT': 'assistant',
'SYSTEM': 'system'
}[item['role']]
'SYSTEM': 'system',
}[item['role'].upper()]
}
history.append(msg)
user_content = history[-1]['content']
......@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
response, history = self.model.chat(self.tokenizer,
user_content,
history=history)
# response will sometimes be a dict
if isinstance(response, dict):
response = response.get('content', '')
responses.append(response)
except Exception:
responses.append('')
......
......@@ -52,7 +52,7 @@ class LagentAgent:
def chat(self,
user_input: str,
history: List[dict] = None) -> Tuple[str, List[dict]]:
history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
"""Chat with agent."""
if history:
self.agent._session_history = history
......@@ -60,6 +60,7 @@ class LagentAgent:
from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input)
inner_steps = generation.inner_steps
answer = generation.response
steps = []
......@@ -76,7 +77,7 @@ class LagentAgent:
valid=int(step.valid),
))
return answer, steps
return answer, steps, inner_steps
FORCE_STOP_PROMPT_EN = (
......
......@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
dialog = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
if item['role'].upper() == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
elif item['role'].upper() == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
elif item['role'].upper() == 'SYSTEM':
msg['role'] = 'system'
else:
raise ValueError(f'Unknown role: {item["role"]}')
dialog.append(msg)
dialogs.append(dialog)
......