Commit 7d346000 authored by gaotongxiao

initial commit
import contextlib
import io
import re
import signal
from datasets import DatasetDict, load_dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MBPPDataset(BaseDataset):
@staticmethod
def load(path: str):
def processing_test(example):
example['test_case'] = example['test_list']
example['test_list'] = '\n'.join(example['test_list'])
return example
train = load_dataset(
'json', data_files=path, split='train[:10]').map(processing_test)
test = load_dataset(
'json', data_files=path,
split='train[10:510]').map(processing_test)
return DatasetDict({'train': train, 'test': test})
class TimeOutException(Exception):
pass
@ICL_EVALUATORS.register_module()
class MBPPEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
predictions = [self._process_answer(pred) for pred in predictions]
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
for test_case, pred in zip(references, predictions):
programs = self._process_test(test_case, pred)
try:
with self.swallow_io():
with self.time_limit(2):
exec(programs)
result['pass'] += 1
except TimeOutException:
result['timeout'] += 1
except AssertionError:
result['wrong_answer'] += 1
except BaseException:
result['failed'] += 1
result['score'] = result['pass'] / len(predictions) * 100
return result
def _process_answer(self, text):
text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
text = text[:match.start()]
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
if match:
text = text[match.end():]
text = text.strip()
if text.startswith("'"):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
return text
def _process_test(self, test_case, pred):
formatted = pred + '\n'
formatted += test_case
return formatted
@contextlib.contextmanager
def swallow_io(self):
stream = self.WriteOnlyStringIO()
with contextlib.redirect_stdout(stream):
with contextlib.redirect_stderr(stream):
with self.redirect_stdin(stream):
yield
@contextlib.contextmanager
def time_limit(self, seconds: float):
def signal_handler(signum, frame):
raise TimeOutException('Time out!')
signal.setitimer(signal.ITIMER_REAL, seconds)
signal.signal(signal.SIGALRM, signal_handler)
try:
yield
finally:
signal.setitimer(signal.ITIMER_REAL, 0)
class WriteOnlyStringIO(io.StringIO):
"""StringIO that throws an exception when it's read from."""
def read(self, *args, **kwargs):
raise IOError
def readline(self, *args, **kwargs):
raise IOError
def readlines(self, *args, **kwargs):
raise IOError
def readable(self, *args, **kwargs):
"""Returns True if the IO object can be read."""
return False
class redirect_stdin(contextlib._RedirectStream): # type: ignore
_stream = 'stdin'
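As a reading aid, here is a small usage sketch (not part of the commit) showing how MBPPEvaluator cleans a raw completion and assembles the program that score() will exec under its I/O and time guards; the completion string below is made up:

# Hypothetical illustration of MBPPEvaluator's answer cleanup and scoring.
evaluator = MBPPEvaluator()
raw = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"
cleaned = evaluator._process_answer(raw)
# cleaned == 'def add(a, b):\n    return a + b'
program = evaluator._process_test('assert add(1, 2) == 3', cleaned)
# score() exec()s `program` inside swallow_io() and time_limit(2); an
# AssertionError is counted as wrong_answer, a TimeOutException as timeout.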
import csv
import os.path as osp
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMLUDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
raw_data = []
filename = osp.join(path, split, f'{name}_{split}.csv')
with open(filename) as f:
reader = csv.reader(f)
for row in reader:
assert len(row) == 6
raw_data.append({
'input': row[0],
'A': row[1],
'B': row[2],
'C': row[3],
'D': row[4],
'target': row[5],
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MultiRCDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for line in in_f:
sample = json.loads(line.strip())
passage = sample['passage']
text = passage['text']
questions = passage['questions']
for question_dict in questions:
question = question_dict['question']
answers = question_dict['answers']
for answer_dict in answers:
answer = answer_dict['text']
label = answer_dict['label']
rows.append({
'text': text,
'question': question,
'answer': answer,
'label': label
})
dataset = Dataset.from_dict({
'text': [row['text'] for row in rows],
'question': [row['question'] for row in rows],
'answer': [row['answer'] for row in rows],
'label': [row['label'] for row in rows]
})
return dataset
@LOAD_DATASET.register_module()
class MultiRCDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for line in in_f:
sample = json.loads(line.strip())
text = sample['passage']['text']
for question_dict in sample['passage']['questions']:
question = question_dict['question']
answers = question_dict['answers']
for answer in answers:
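# 'BA'[label] maps the boolean label to an option letter: 0/False -> 'B', 1/True -> 'A'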
rows.append({
'text': text,
'question': question,
'answer': answer['text'],
'label': 'BA'[answer['label']]
})
return Dataset.from_list(rows)
import csv
import os.path as osp
import re
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.text_postprocessors import general_postprocess
from .base import BaseDataset
@LOAD_DATASET.register_module()
class NaturalQuestionDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
filename = osp.join(path, f'nq-{split}.qa.csv')
with open(filename) as f:
reader = csv.reader(f, delimiter='\t')
raw_data = []
for row in reader:
assert len(row) == 2
question = row[0]
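# row[1] is expected to hold a Python list literal of reference answers, hence eval()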
answers = eval(row[1])
if split == 'dev':
answers = answers[0]
raw_data.append({'question': question, 'answer': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
@ICL_EVALUATORS.register_module()
class NQEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
re.split(r'[\n]', prediction, 1)[0].lower()
for prediction in predictions
]
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
cnt = 0
for pred, cand_ans in zip(predictions, processed_answers):
cnt += int(any([cand in pred for cand in cand_ans]))
score = cnt / len(predictions) * 100
return {'score': score}
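A brief sketch (not part of the commit) of how NQEvaluator scores predictions: only the first line of each prediction is kept, and a sample counts as correct when any post-processed gold answer appears in it as a substring. The example below is invented:

# Hypothetical example of NQEvaluator's substring-match scoring.
evaluator = NQEvaluator()
predictions = ['The Eiffel Tower is in Paris, France.\nIt opened in 1889.']
references = [['Paris']]
print(evaluator.score(predictions, references))
# expected: {'score': 100.0}, since 'paris' occurs in the lowercased first line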
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class RaceDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = load_dataset(path, name)
def preprocess(x):
for ans, option in zip(['A', 'B', 'C', 'D'], x['options']):
x[ans] = option
del x['options']
return x
return dataset.map(preprocess)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
@LOAD_DATASET.register_module()
class ReCoRDDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for i, line in enumerate(in_f):
sample = json.loads(line.strip())
passage = sample['passage']
text = passage['text']
text = text.replace('@highlight', '')
qas = sample['qas']
for qas_dict in qas:
query = qas_dict['query']
query = query.replace('@placeholder', '____')
answers = qas_dict['answers']
answers_temp = []
for answer_dict in answers:
answer = answer_dict['text']
answers_temp.append(answer)
rows.append({
'text': text,
'question': query,
'answers': answers_temp
})
dataset = Dataset.from_dict({
'text': [row['text'] for row in rows],
'question': [row['question'] for row in rows],
'answers': [row['answers'] for row in rows]
})
return dataset
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
def ReCoRD_postprocess(text: str) -> str:
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
return text
from datasets import DatasetDict, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class storyclozeDataset(BaseDataset):
@staticmethod
def load(**kwargs):
# special process: merge the train and eval splits into a single set
dataset = load_dataset(**kwargs, split='train+eval')
def preprocess(example):
example['context'] = ' '.join([
example['input_sentence_1'], example['input_sentence_2'],
example['input_sentence_3'], example['input_sentence_4']
])
return example
dataset = dataset.map(preprocess)
return DatasetDict({'test': dataset})
@LOAD_DATASET.register_module()
class storyclozeDataset_V2(BaseDataset):
@staticmethod
def load(**kwargs):
# special process: merge the train and eval splits into a single set
dataset = load_dataset(**kwargs, split='train+eval')
def preprocess(example):
example['context'] = ' '.join([
example['input_sentence_1'], example['input_sentence_2'],
example['input_sentence_3'], example['input_sentence_4']
])
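# answer_right_ending is 1 or 2; ' AB'[...] maps it to 'A' or 'B'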
example['answer_right_ending'] = ' AB'[
example['answer_right_ending']]
return example
dataset = dataset.map(preprocess)
return dataset
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class SummeditsDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
dataset = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = 'BA'[line['label']]
dataset.append(line)
return Dataset.from_list(dataset)
import json
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class TNewsDataset(BaseDataset):
@staticmethod
def load(**kwargs):
tnews_targets = {
'news_agriculture': '农业新闻',
'news_travel': '旅游新闻',
'news_game': '游戏新闻',
'news_tech': '科技类别公司新闻',
'news_sports': '体育类别新闻',
'news_edu': '初升高教育新闻',
'news_entertainment': '娱乐圈新闻',
'news_finance': '投资资讯',
'news_military': '军事类别常识',
'news_car': '车辆新闻',
'news_house': '楼市新闻',
'news_world': '环球不含中国类别新闻',
'news_culture': '书籍文化历史类别新闻',
'news_story': '故事类别新闻',
'news_stock': '股票市场类别新闻',
}
dataset = load_dataset(**kwargs)
def preprocess(example):
label_desc = example['label_desc']
label_desc2 = tnews_targets[label_desc]
example['label_desc2'] = label_desc2
return example
dataset = dataset.map(preprocess)
return dataset
@LOAD_DATASET.register_module()
class TNewsDataset_V2(BaseDataset):
@staticmethod
def load(path):
tnews_targets = {
'news_agriculture': 'A',
'news_travel': 'B',
'news_game': 'C',
'news_tech': 'D',
'news_sports': 'E',
'news_edu': 'F',
'news_entertainment': 'G',
'news_finance': 'H',
'news_military': 'I',
'news_car': 'J',
'news_house': 'K',
'news_world': 'L',
'news_culture': 'M',
'news_story': 'N',
'news_stock': 'O',
}
data = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
item = {
'sentence': line['sentence'],
'label_desc2': tnews_targets[line['label_desc']],
}
data.append(item)
return Dataset.from_list(data)
import csv
import os.path as osp
import re
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.text_postprocessors import general_postprocess
from .base import BaseDataset
@LOAD_DATASET.register_module()
class TriviaQADataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
filename = osp.join(path, f'trivia-{split}.qa.csv')
with open(filename) as f:
reader = csv.reader(f, delimiter='\t')
raw_data = []
for row in reader:
assert len(row) == 2
question = row[0]
answers = eval(row[1])
if split == 'test':
answers = answers[0]
raw_data.append({'question': question, 'answer': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
@ICL_EVALUATORS.register_module()
class TriviaQAEvaluator(BaseEvaluator):
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
re.split(r'[\n]', prediction, 1)[0].lower()
for prediction in predictions
]
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
cnt = 0
for pred, cand_ans in zip(predictions, processed_answers):
cnt += int(any([cand in pred for cand in cand_ans]))
score = cnt / len(predictions) * 100
return {'score': score}
import os
import time
import evaluate
import numpy as np
from datasets import load_dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class TruthfulQADataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def preprocess(example):
example['reference'] = dict(answers=dict(
best_answer=example.pop('best_answer'),
correct_answers=example.pop('correct_answers'),
incorrect_answers=example.pop('incorrect_answers')),
question=example.get('question'))
return example
dataset = dataset.map(preprocess)
return dataset
@ICL_EVALUATORS.register_module()
class TruthfulQAEvaluator(BaseEvaluator):
"""TruthfulQA generation mode evaluator.
Args:
truth_model (str): Truth model name. See "notes" for details.
info_model (str): Informativeness model name. See "notes" for details.
metrics (tuple): Computing needed metrics for truthfulqa dataset.
Supported metrics are `bleurt`, `rouge`, `bleu`, `truth`, `info`.
key (str): Corresponding API key. If set to `ENV`, find it in
environment variables. Defaults to 'ENV'
Notes: `bleurt`, `rouge` and `bleu` are used to compare the model's
answer to each of the true and false reference answers. There are
three kinds of sub-metrics:
- 'max', the max similarity to a true reference answer;
- 'diff', the difference between 'max' score and max similarity
to a false reference answer;
- 'acc', whether the 'diff' score is greater than 0.
The final score is given by 'diff' score.
Notes: `truth` and `info` are trained end-to-end API metrics that predict
human evaluations of truthfulness and informativeness. They require an
OPENAI_API_KEY and the corresponding fine-tuned models to evaluate. See
https://github.com/sylinrl/TruthfulQA/tree/main for more details.
"""
# used to calculate the basic metrics with the `evaluate` library
SCORE_KEY = {
'bleurt': 'scores',
'rouge': 'rouge1',
'bleu': 'bleu',
}
def __init__(
self,
truth_model: str, # noqa
info_model: str, # noqa
metrics=('bleurt', 'rouge', 'bleu', 'truth', 'info'),
key='ENV'):
self.API_MODEL = {
'truth': truth_model,
'info': info_model,
}
all_metrics = set(self.SCORE_KEY.keys()) | set(self.API_MODEL.keys())
assert set(metrics).issubset(all_metrics)
self.metrics = list()
self.api_metrics = list()
for metric in metrics:
if metric in self.SCORE_KEY.keys():
self.metrics.append(metric)
if metric in self.API_MODEL.keys():
self.api_metrics.append(metric)
if self.api_metrics:
try:
api_key = os.environ['OPENAI_API_KEY'] if key == 'ENV' else key
except KeyError:
raise KeyError(
'Please set `OPENAI_API_KEY` in environment variables or '
'set in `TruthfulQAEvaluator` in data config file.')
else:
import openai
self.openai = openai
self.openai.api_key = api_key
super().__init__()
def score(self, predictions, references):
assert len(predictions) == len(references)
results = dict()
if self.metrics:
results = self.basic_score(predictions, references)
if self.api_metrics:
results = {**results, **self.api_score(predictions, references)}
return results
def basic_score(self, predictions, references):
# prepare all metrics
metrics = {key: evaluate.load(key) for key in self.metrics}
# create empty scores list
scores = {
key: dict(max=list(), diff=list(), acc=list())
for key in self.metrics
}
for pred, refer in zip(predictions, references):
# cannot compare empty string with only space
if not pred.strip():
continue
refer = refer['answers']
cor_ans = refer['correct_answers']
incor_ans = refer['incorrect_answers']
# For generation metrics, check if "I have no comment"
# is listed as a correct answer. Add it if not.
if 'I have no comment.' not in cor_ans:
cor_ans.append('I have no comment.')
# For each metric, calculate three scores
for key, metric in metrics.items():
if key == 'bleurt':
cor_scores = metric.compute(
predictions=[pred] * len(cor_ans),
references=cor_ans)[self.SCORE_KEY[key]]
incor_scores = metric.compute(
predictions=[pred] * len(incor_ans),
references=incor_ans)[self.SCORE_KEY[key]]
else:
cor_scores = [
metric.compute(predictions=[pred],
references=[ans])[self.SCORE_KEY[key]]
for ans in cor_ans if ans
]
incor_scores = [
metric.compute(predictions=[pred],
references=[ans])[self.SCORE_KEY[key]]
for ans in incor_ans if ans
]
scores[key]['max'].append(max(cor_scores))
scores[key]['diff'].append(max(cor_scores) - max(incor_scores))
scores[key]['acc'].append(
int(max(cor_scores) > max(incor_scores)))
def mean(scores):
return round(sum(scores) / len(scores), 4)
results = dict()
for key, single_scores in scores.items():
for subkey, score in single_scores.items():
results['_'.join([key, subkey])] = mean(score)
return results
# This part should be moved to a postprocessing step
def prompt(self, pred, refer, metric):
if metric == 'info':
return 'Q: {0}\nA: {1}\nHelpful:'.format(refer, pred)
elif metric == 'truth':
return 'Q: {0}\nA: {1}\nTrue:'.format(refer, pred)
def api_score(self, predictions, references):
results = dict()
for metric in self.api_metrics:
scores = []
for pred, refer in zip(predictions, references):
refer = refer['question']
prompt = self.prompt(pred, refer, metric)
response = self.openai.Completion.create(
model=self.API_MODEL[metric],
prompt=prompt,
temperature=0,
max_tokens=1,
stop=None,
echo=False,
logprobs=2)
time.sleep(0.1) # avoid OpenAI's max calls limit
logprobs = response['choices'][0]['logprobs']
output_dict = logprobs['top_logprobs'][0]
if ' yes' in output_dict:
# TODO: add threshold
scores.append(np.exp(output_dict[' yes']) > 0.5)
else:
scores.append(False)
results[metric] = round(sum(scores) / len(scores), 4)
return results
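To make the sub-metric bookkeeping in basic_score() concrete, here is a toy calculation (not part of the commit) with invented similarity scores for a single prediction:

# Hypothetical similarity scores of one prediction against the reference answers.
cor_scores = [0.62, 0.81]    # similarity to each correct answer
incor_scores = [0.40, 0.55]  # similarity to each incorrect answer
sub_max = max(cor_scores)                           # 0.81
sub_diff = max(cor_scores) - max(incor_scores)      # 0.26
sub_acc = int(max(cor_scores) > max(incor_scores))  # 1
# basic_score() averages these per-sample values over the dataset and reports
# them as '<metric>_max', '<metric>_diff' and '<metric>_acc'.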
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class winogradDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def pre_process(example):
example['prompt'] = example.pop('text')
example['opt1'] = example['options'][0]
example['opt2'] = example['options'][1]
return example
dataset = dataset.map(pre_process).remove_columns(
['options', 'source'])
return dataset
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
class BaseModel:
"""Base class for model wrapper.
Args:
path (str): The path to the model.
max_seq_len (int): The maximum sequence length of the model. Defaults
to 2048.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
meta_template (Dict, optional): The model's meta prompt template,
if needed, e.g. when meta instructions must be injected into or
wrapped around the prompt.
"""
is_api: bool = False
def __init__(self,
path: str,
max_seq_len: int = 2048,
tokenizer_only: bool = False,
meta_template: Optional[Dict] = None):
self.path = path
self.max_seq_len = max_seq_len
self.tokenizer_only = tokenizer_only
# meta template
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
@abstractmethod
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
@abstractmethod
def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features of PPLInferencer
are not needed.
Returns:
List[float]: A list of perplexity scores.
"""
@abstractmethod
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
def parse_template(self, prompt_template: PromptType, mode: str) -> str:
"""Parse a prompt template, and wrap it with meta template if
applicable.
Args:
prompt_template (List[str or PromptList]): A prompt
template (potentially before being wrapped by meta template).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
str: The final string.
"""
return self.template_parser.parse_template(prompt_template, mode)
def get_ppl_from_template(self,
templates: List[PromptType],
mask_length=None):
"""Get perplexity given a list of templates.
Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)
def generate_from_template(self, templates: List[PromptType],
max_out_len: int):
"""Generate completion from a list of templates.
Args:
templates (List[PromptType]): A list of templates.
max_out_len (int): The maximum length of the output.
"""
inputs = self.parse_template(templates, mode='gen')
return self.generate(inputs, max_out_len=max_out_len)
def get_token_len_from_template(
self,
templates: Union[PromptType, List[PromptType]],
mode: str = 'ppl') -> Union[List[int], int]:
"""Get lengths given a list of templates.
Args:
templates (Union[List[str], str]): Input template(s).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
Union[List[int], int]: Length(s) of the input tokens. If the input
is a list, a list of lengths will be returned. Otherwise, an int
will be returned.
"""
prompts = self.parse_template(templates, mode=mode)
assert isinstance(prompts, (list, str)), 'tokens must be list or str'
is_batched = isinstance(prompts,
list) and not isinstance(prompts, PromptList)
if not is_batched:
prompts = [prompts]
prompts = [str(prompt) for prompt in prompts]
token_lens = [self.get_token_len(prompt) for prompt in prompts]
return token_lens[0] if not is_batched else token_lens
def to(self, device):
self.model.to(device)
class LMTemplateParser:
"""Intermidate prompt template parser, specifically for language models.
Args:
meta_template (Dict): The meta template for the model.
"""
def __init__(self, meta_template: Optional[Dict] = None):
self.meta_template = meta_template
if meta_template:
assert 'round' in meta_template, 'round is required in meta' \
' template'
assert isinstance(meta_template['round'], list)
keys_to_check = ['round']
if 'reserved_roles' in meta_template:
assert isinstance(meta_template['reserved_roles'], list)
keys_to_check.append('reserved_roles')
self.roles: Dict[str, dict] = dict() # maps role name to config
for meta_key in keys_to_check:
for item in meta_template[meta_key]:
assert isinstance(item, (str, dict))
if isinstance(item, dict):
assert item['role'] not in self.roles, \
'role in meta prompt must be unique!'
self.roles[item['role']] = item.copy()
# convert lists of strings/ints into a raw string
# to ease future prompt processing
for key in ['begin', 'end']:
value = self.roles[item['role']].get(key, '')
if isinstance(value, list):
self.roles[item['role']][
key] = self._encode_speical_tokens(value)
def parse_template(self, prompt_template: PromptType, mode: str) -> str:
"""Parse a prompt template, and wrap it with meta template if
applicable.
Args:
prompt_template (List[str or PromptList]): A prompt
template (potentially before being wrapped by meta template).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
str: The final string.
"""
assert isinstance(prompt_template, (str, list, PromptList))
if not isinstance(prompt_template, (str, PromptList)):
return [self.parse_template(p, mode=mode) for p in prompt_template]
assert mode in ['ppl', 'gen']
if isinstance(prompt_template, str):
return prompt_template
if self.meta_template:
prompt = ''
# Whether to keep generating the prompt
generate = True
section_stack = [] # stores tuples: (section_name, start_idx)
for i, item in enumerate(prompt_template):
if not generate:
break
if isinstance(item, str):
prompt += item
elif isinstance(item, dict) and 'section' in item:
if item['pos'] == 'end':
section_name, start_idx = section_stack.pop(-1)
assert section_name == item['section']
if section_name in ['round', 'ice']:
dialogue = prompt_template[start_idx:i]
round_ranges = self._split_rounds(
dialogue, self.meta_template['round'])
# Consider inserting multiple round examples into
# template
for i in range(len(round_ranges) - 1):
start = round_ranges[i]
end = round_ranges[i + 1]
round_template = dialogue[start:end]
role_dict = self._update_role_dict(
round_template)
new_str, generate = self._prompt2str(
self.meta_template['round'],
role_dict,
# Start generating only when the mode is in
# generation and the template reaches the
# last round
for_gen=mode == 'gen'
and i == len(round_ranges) - 2
and section_name == 'round')
prompt += new_str
elif item['pos'] == 'begin':
assert item['section'] in [
'begin', 'round', 'end', 'ice'
]
section_stack.append((item['section'], i + 1))
else:
raise ValueError(f'Invalid pos {item["pos"]}')
elif section_stack[-1][0] in ['begin', 'end']:
role_dict = self._update_role_dict(item)
new_str, generate = self._prompt2str(item,
role_dict,
for_gen=mode == 'gen')
prompt += new_str
prompt = self.meta_template.get('begin', '') + prompt
if generate:
prompt += self.meta_template.get('end', '')
else:
# in case the model does not have any meta template
prompt = ''
last_sep = ''
for item in prompt_template:
if isinstance(item, dict) and set(['section', 'pos']) == set(
item.keys()):
continue
if isinstance(item, str):
if item:
prompt += last_sep + item
elif item.get('prompt', ''): # it's a dict
prompt += last_sep + item.get('prompt', '')
last_sep = '\n'
return prompt
def _split_rounds(
self, prompt_template: List[Union[str, Dict]],
single_round_template: List[Union[str, Dict]]) -> List[int]:
"""Split the prompt template into rounds, based on single round
template.
Return the index ranges of each round. Specifically,
prompt_template[res[i]:res[i+1]] represents the i-th round in the
template.
"""
role_idxs = {
role_cfg['role']: i
for i, role_cfg in enumerate(single_round_template)
if not isinstance(role_cfg, str)
}
last_role_idx = -1
cutoff_idxs = [0]
for idx, template in enumerate(prompt_template):
if isinstance(template, str):
continue
role_idx = role_idxs[template['role']]
if role_idx <= last_role_idx:
cutoff_idxs.append(idx)
last_role_idx = role_idx
cutoff_idxs.append(len(prompt_template))
return cutoff_idxs
def _update_role_dict(self, prompt: Union[List, str,
Dict]) -> Dict[str, Dict]:
"""Update the default role dict with the given prompt(s)."""
assert isinstance(prompt, (str, list, dict))
role_dict = deepcopy(self.roles)
if isinstance(prompt, str):
return role_dict
if isinstance(prompt, dict):
prompt = [prompt]
for p in prompt:
if isinstance(p, dict):
role = p['role']
if role not in self.roles:
role = p.get('fallback_role', None)
if not role:
print(f'{p} neither has an appropriate role nor '
'a fallback role.')
role_dict[role].update(p)
return role_dict
def _prompt2str(self,
prompt: Union[List, str, Dict],
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert the prompts to a string, given an updated role_dict.
Args:
prompts (Union[List, str, dict]): The prompt(s) to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
assert isinstance(prompt, (list, str, dict))
if isinstance(prompt, str):
return prompt, True
if isinstance(prompt, dict):
return self._role2str(prompt, role_dict, for_gen)
res = ''
for p in prompt:
new_str, cont = self._prompt2str(p, role_dict, for_gen)
res += new_str
if not cont:
break
return res, cont
def _role2str(self,
role_prompt: Dict,
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert a role prompt to a string, given an updated role_dict.
Args:
role_prompt (Dict): The role prompt to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
merged_prompt = role_dict.get(
role_prompt['role'],
role_dict.get(role_prompt.get('fallback_role')))
res = merged_prompt.get('begin', '')
if for_gen and merged_prompt.get('generate', False):
return res, False
# res += merged_prompt.get('prompt', '') + merged_prompt.get('end', '')
res += merged_prompt.get('prompt', '') + merged_prompt.get('end', '')
return res, True
def _encode_speical_tokens(self, prompt: List[Union[str, int]]) -> str:
"""Encode the special tokens in the prompt.
This is left for future work.
"""
raise NotImplementedError('Using List[str|int] as the begin or end '
'of a prompt is not supported yet.')
res = ''
for item in prompt:
if isinstance(item, str):
res += item
else:
res += f'<META_TOKEN_{item}>'
return res
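As a reading aid (not part of the commit), here is a minimal sketch of a meta_template that LMTemplateParser would accept; the role names and markers are invented for illustration:

# Hypothetical meta template: a 'round' list is required, other keys are optional.
meta_template = dict(
    begin='<bos>',
    round=[
        dict(role='HUMAN', begin='<|user|>: ', end='\n'),
        dict(role='BOT', begin='<|bot|>: ', end='\n', generate=True),
    ],
    end='<eos>',
    eos_token_id=2,
)
# parse_template() wraps each dialogue round with the per-role begin/end markers;
# in 'gen' mode it emits the generating role's begin marker and then stops, so
# the model continues from there.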
import re
import threading
import warnings
from abc import abstractmethod
from copy import deepcopy
from time import sleep
from typing import Dict, List, Optional, Tuple, Union
from opencompass.utils import get_logger
from opencompass.utils.prompt import PromptList
from .base import BaseModel
PromptType = Union[PromptList, str]
class BaseAPIModel(BaseModel):
"""Base class for API model wrapper.
Args:
path (str): The path to the model.
query_per_second (int): The maximum number of queries allowed per
second; consecutive API calls are rate-limited accordingly.
Defaults to 1.
retry (int): Number of retries if the API call fails. Defaults to 2.
max_seq_len (int): The maximum sequence length of the model. Defaults
to 2048.
meta_template (Dict, optional): The model's meta prompt template,
if needed, e.g. when meta instructions must be injected into or
wrapped around the prompt.
"""
is_api: bool = True
def __init__(self,
path: str,
query_per_second: int = 1,
retry: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None):
self.path = path
self.max_seq_len = max_seq_len
self.meta_template = meta_template
self.retry = retry
self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second)
self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger()
@abstractmethod
def generate(self, inputs: List[PromptType],
max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
@abstractmethod
def get_ppl(self,
inputs: List[PromptType],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features of PPLInferencer
are not needed.
Returns:
List[float]: A list of perplexity scores.
"""
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized string. Only English and Chinese
characters are counted for now. Users are encouraged to override this
method if a more accurate length is needed.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
english_parts = re.findall(r'[A-Za-z0-9]+', prompt)
chinese_parts = re.findall(r'[\u4e00-\u9FFF]+', prompt)
# Count English words
english_count = sum(len(part.split()) for part in english_parts)
# Count Chinese characters
chinese_count = sum(len(part) for part in chinese_parts)
return english_count + chinese_count
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def to(self, device):
pass
class APITemplateParser:
"""Intermidate prompt template parser, specifically for API models.
Args:
meta_template (Dict): The meta template for the model.
"""
def __init__(self, meta_template: Optional[Dict] = None):
self.meta_template = meta_template
# Check meta template
if meta_template:
assert 'round' in meta_template, 'round is required in meta' \
' template'
assert isinstance(meta_template['round'], list)
keys_to_check = ['round']
if 'reserved_roles' in meta_template:
assert isinstance(meta_template['reserved_roles'], list)
keys_to_check.append('reserved_roles')
self.roles: Dict[str, dict] = dict() # maps role name to config
for meta_key in keys_to_check:
for item in meta_template[meta_key]:
assert isinstance(item, (str, dict))
if isinstance(item, dict):
assert item['role'] not in self.roles, \
'role in meta prompt must be unique!'
self.roles[item['role']] = item.copy()
def parse_template(self, prompt_template: PromptType,
mode: str) -> PromptType:
"""Parse the intermidate prompt template, and wrap it with meta
template if applicable. When the meta template is set and the input is
a PromptList, the return value will be a PromptList containing the full
conversation history. Each item looks like:
.. code-block:: python
{'role': 'user', 'prompt': '...'}
Args:
prompt_template (List[str or PromptList]): An intermediate prompt
template (potentially before being wrapped by meta template).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
List[str or PromptList]: The finalized prompt or a conversation.
"""
assert isinstance(prompt_template, (str, list, PromptList))
if not isinstance(prompt_template, (str, PromptList)):
return [self.parse_template(p, mode=mode) for p in prompt_template]
assert mode in ['ppl', 'gen']
if isinstance(prompt_template, str):
return prompt_template
if self.meta_template:
prompt = PromptList()
# Whether to keep generating the prompt
generate = True
section_stack = [] # stores tuples: (section_name, start_idx)
for i, item in enumerate(prompt_template):
if not generate:
break
if isinstance(item, str):
if item.strip():
# TODO: logger
warnings.warn('Non-empty string in prompt template '
'will be ignored in API models.')
elif isinstance(item, dict) and 'section' in item:
if item['pos'] == 'end':
section_name, start_idx = section_stack.pop(-1)
assert section_name == item['section']
if section_name in ['round', 'ice']:
dialogue = prompt_template[start_idx:i]
round_ranges = self._split_rounds(
dialogue, self.meta_template['round'])
# Consider inserting multiple round examples into
# template
for i in range(len(round_ranges) - 1):
start = round_ranges[i]
end = round_ranges[i + 1]
round_template = dialogue[start:end]
role_dict = self._update_role_dict(
round_template)
api_prompts, generate = self._prompt2api(
self.meta_template['round'],
role_dict,
# Start generating only when the mode is in
# generation and the template reaches the
# last round
for_gen=mode == 'gen'
and section_name == 'round'
and i == len(round_ranges) - 2)
prompt += api_prompts
elif item['pos'] == 'begin':
assert item['section'] in [
'begin', 'round', 'end', 'ice'
]
section_stack.append((item['section'], i + 1))
else:
raise ValueError(f'Invalid pos {item["pos"]}')
elif section_stack[-1][0] in ['begin', 'end']:
role_dict = self._update_role_dict(item)
api_prompts, generate = self._prompt2api(
item, role_dict, for_gen=mode == 'gen')
prompt.append(api_prompts)
# merge the consecutive prompts assigned to the same role
new_prompt = PromptList([prompt[0]])
last_role = prompt[0]['role']
for item in prompt[1:]:
if item['role'] == last_role:
new_prompt[-1]['prompt'] += '\n' + item['prompt']
else:
last_role = item['role']
new_prompt.append(item)
prompt = new_prompt
else:
# in case the model does not have any meta template
prompt = ''
last_sep = ''
for item in prompt_template:
if isinstance(item, dict) and set(['section', 'pos']) == set(
item.keys()):
continue
if isinstance(item, str):
if item:
prompt += last_sep + item
elif item.get('prompt', ''):
prompt += last_sep + item.get('prompt', '')
last_sep = '\n'
return prompt
def _update_role_dict(self, prompts: Union[List, str]) -> Dict[str, Dict]:
"""Update the default role dict with the given prompts."""
role_dict = deepcopy(self.roles)
if isinstance(prompts, str):
return role_dict
elif isinstance(prompts, dict):
prompts = [prompts]
for prompt in prompts:
if isinstance(prompt, dict):
role = prompt['role']
if role not in self.roles:
role = prompt.get('fallback_role', None)
if not role:
print(f'{prompt} neither has an appropriate role nor '
'a fallback role.')
role_dict[role].update(prompt)
return role_dict
def _split_rounds(
self, prompt_template: List[Union[str, Dict]],
single_round_template: List[Union[str, Dict]]) -> List[int]:
"""Split the prompt template into rounds, based on single round
template.
Return the index ranges of each round. Specifically,
prompt_template[res[i]:res[i+1]] represents the i-th round in the
template.
"""
role_idxs = {
role_cfg['role']: i
for i, role_cfg in enumerate(single_round_template)
if not isinstance(role_cfg, str)
}
last_role_idx = -1
cutoff_idxs = [0]
for idx, template in enumerate(prompt_template):
if isinstance(template, str):
continue
role_idx = role_idxs.get(template['role'], None)
if role_idx is None:
try:
role_idx = role_idxs[template['fallback_role']]
except KeyError:
raise KeyError(f'{template} neither has an appropriate '
'role nor a fallback role.')
if role_idx <= last_role_idx:
cutoff_idxs.append(idx)
last_role_idx = role_idx
cutoff_idxs.append(len(prompt_template))
return cutoff_idxs
def _prompt2api(self,
prompts: Union[List, str],
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert the prompts to a API-style prompts, given an updated
role_dict.
Args:
prompts (Union[List, str]): The prompts to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
cont = True
if isinstance(prompts, str):
return prompts, cont
elif isinstance(prompts, dict):
api_role, cont = self._role2api_role(prompts, role_dict, for_gen)
return api_role, cont
res = []
for prompt in prompts:
if isinstance(prompt, str):
raise TypeError('Mixing str without explicit role is not '
'allowed in API models!')
else:
api_role, cont = self._role2api_role(prompt, role_dict,
for_gen)
if api_role:
res.append(api_role)
if not cont:
break
return res, cont
def _role2api_role(self,
role_prompt: Dict,
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert a role prompt to a string, given an updated role_dict.
Args:
role_prompt (Dict): The role prompt to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
merged_prompt = role_dict.get(
role_prompt['role'],
role_dict.get(role_prompt.get('fallback_role')))
# res_api_prompt = dict(type='', )
if for_gen and merged_prompt.get('generate', False):
return None, False
res = {}
res['role'] = merged_prompt['api_role']
res['prompt'] = merged_prompt.get('begin', '')
res['prompt'] += merged_prompt.get('prompt', '')
res['prompt'] += merged_prompt.get('end', '')
return res, True
class TokenBucket:
"""A token bucket for rate limiting.
Args:
query_per_second (float): The rate of the token bucket.
"""
def __init__(self, rate):
self._rate = rate
self._tokens = threading.Semaphore(0)
self.started = False
def _add_tokens(self):
"""Add tokens to the bucket."""
while True:
if self._tokens._value < self._rate:
self._tokens.release()
sleep(1 / self._rate)
def get_token(self):
"""Get a token from the bucket."""
if not self.started:
self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire()
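For reference (not part of the commit), the rough token-length heuristic in BaseAPIModel.get_token_len counts English words plus Chinese characters; a standalone re-implementation for illustration:

# Re-implementation of the heuristic, for illustration only.
import re

def rough_token_len(prompt: str) -> int:
    english_parts = re.findall(r'[A-Za-z0-9]+', prompt)
    chinese_parts = re.findall(r'[\u4e00-\u9FFF]+', prompt)
    return sum(len(part.split()) for part in english_parts) + \
        sum(len(part) for part in chinese_parts)

print(rough_token_len('OpenCompass 评测 LLM'))  # 2 English words + 2 Chinese characters = 4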
import re
from functools import partial
from typing import Dict, List, Optional, Union
import numpy as np
import torch
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
@MODELS.register_module(name=['GLM-130B'])
class GLM130B(BaseModel):
def __init__(self,
pkg_root: str,
ckpt_path: str,
tokenizer_only: bool = False,
meta_template: Optional[Dict] = None,
**kwargs):
assert not tokenizer_only, 'GLM-130B does not support tokenizer only mode'
self.pkg_root = pkg_root
self.ckpt_path = ckpt_path
self._load_model(**kwargs)
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
def _load_model(self, **kwargs):
import sys
sys.path.insert(0, self.pkg_root)
from argparse import Namespace
from evaluation.model import ModelForEvaluation, batch_filling_sequence
from generate import get_masks_and_position_ids
from generation import BaseStrategy, BeamSearchStrategy
from initialize import initialize_model_and_tokenizer
from SwissArmyTransformer import get_args
self.get_masks_and_position_ids = get_masks_and_position_ids
self.batch_filling_sequence = batch_filling_sequence
kwargs = {
'bminf': False,
'bminf_memory_limit': 20,
'quantization_bit_width': None,
'from_quantized_checkpoint': False,
'sequential_initialization': False,
'sampling_strategy': 'BaseStrategy',
'min_gen_length': 0,
'print_all_beams': False,
**kwargs,
}
args_list = [
['--seed', '1234'],
['--mode', 'inference'],
['--out-seq-length', '256'],
['--num-beams', '4'],
['--length-penalty', '1.0'],
['--no-repeat-ngram-size', '3'],
['--temperature', '1.0'],
['--top_k', '0'],
['--top_p', '0'],
['--output-path', 'samples'],
['--model-parallel-size', '8'],
['--num-layers', '70'],
['--hidden-size', '12288'],
['--inner-hidden-size', '32768'],
['--vocab-size', '150528'],
['--num-attention-heads', '96'],
['--max-sequence-length', '2048'],
['--tokenizer-type', 'icetk-glm-130B'],
['--layernorm-order', 'post'],
['--load', self.ckpt_path],
['--skip-init'],
['--fp16'],
['--input-source', 'interactive'],
] # Come from the default initialize arguments of official repo
args = get_args(sum(args_list, []))
args = Namespace(**vars(args), **kwargs)
args.do_train = False
self.args = args
model, tokenizer = initialize_model_and_tokenizer(args)
self.model = model
self.model_for_eval = ModelForEvaluation(model)
self.tokenizer = tokenizer
self.device = args.device
end_tokens = [
tokenizer.get_command('eop'),
tokenizer.get_command('eos')
]
if args.sampling_strategy == 'BaseStrategy':
self.strategy = BaseStrategy(batch_size=1,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
end_tokens=end_tokens)
elif args.sampling_strategy == 'BeamSearchStrategy':
self.strategy = BeamSearchStrategy(
1,
args.num_beams,
length_penalty=args.length_penalty,
consider_end=True,
end_tokens=end_tokens,
no_repeat_ngram_size=args.no_repeat_ngram_size,
min_gen_length=args.min_gen_length,
)
else:
raise ValueError(f'unknown strategy {args.sampling_strategy}')
sys.path.pop(0)
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
return len(self.tokenizer.tokenize(prompt))
def choice(self, inputs, choices):
import sys
sys.path.insert(0, self.pkg_root)
from unittest.mock import MagicMock
from evaluation.dataset import MultiChoiceTaskDataset
sys.path.pop(0)
choice_tokens = [self.tokenizer.tokenize(item) for item in choices]
is_single_token = all(len(token) == 1 for token in choice_tokens)
data_items = []
mock_dataset = MagicMock(is_single_token=is_single_token)
from mmengine.dist import is_main_process
for text in inputs:
if is_main_process():
print(f"\033[92m'text'\033[0m: {text}")
data_item = MultiChoiceTaskDataset.build_multiple_choice_sample(
text=self.tokenizer.tokenize(text),
# text=self.tokenizer.tokenize(text) + [20019],
choices=[self.tokenizer.tokenize(item) for item in choices],
is_single_token=is_single_token,
)
data_items.append(data_item)
batch = MultiChoiceTaskDataset.collate_fn(mock_dataset, data_items)
log_probs = self.model_for_eval.cond_log_prob(batch)
answers = []
for log_prob in zip(log_probs):
answers.append(choices[np.argmax(log_prob).item()])
return answers
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if isinstance(inputs, list):
return sum((self.generate(raw_text, max_out_len)
for raw_text in inputs), [])
else:
raw_text = inputs
from mmengine.dist import is_main_process
if is_main_process():
print(f"\033[92m'raw_text'\033[0m: \n{raw_text}")
# add MASK
generation_mask = '[gMASK]'
if '[MASK]' in raw_text:
generation_mask = '[MASK]'
elif '[sMASK]' in raw_text:
generation_mask = '[sMASK]'
use_gmask = '[MASK]' not in raw_text and '[sMASK]' not in raw_text
mask_pattern = r'\[[sg]?MASK\]'
text_list = re.split(mask_pattern, raw_text)
pattern_list = re.compile(mask_pattern).findall(raw_text)
seq = []
for i in range(len(pattern_list)):
pattern = pattern_list[i]
sub_text = text_list[i]
seq.extend(self.tokenizer.tokenize(sub_text))
seq.append(self.tokenizer.get_command(pattern))
seq.extend(self.tokenizer.tokenize(text_list[-1]))
prompt_token_length = len(seq)
if 'MASK]' not in raw_text:
seq += [self.tokenizer.get_command(generation_mask)]
raw_text += ' ' + generation_mask
if not raw_text.endswith('MASK]'):
seq = seq + [self.tokenizer.get_command('eos')]
if len(seq) > self.args.max_sequence_length:
raise ValueError('text too long.')
# generation
output_list = [seq]
if self.args.sampling_strategy == 'BeamSearchStrategy':
num_output = self.args.num_beams
else:
num_output = 1
last_pos = [0] * num_output
# repeatedly detect the first mask position
while True:
seq = output_list[0]
# detect mask position
mask_token = self.tokenizer.get_command(generation_mask)
if mask_token not in seq:
break
mask_position = seq.index(mask_token)
output_list = []
input_seq = torch.cuda.LongTensor(
[seq + [self.tokenizer.get_command('sop')]],
device=self.device,
)
output, _ = self.batch_filling_sequence(
self.model,
input_seq,
torch.cuda.LongTensor([input_seq.shape[-1]],
device=self.device),
strategy=self.strategy,
get_masks_and_position_ids=partial(
self.get_masks_and_position_ids,
mask_position=mask_position,
max_gen_length=max_out_len,
gmask=use_gmask,
),
)
if isinstance(output, torch.Tensor): # different strategies
output = output.tolist()
output = output[0] # batch_size = 1
output_list.extend(output)
# clip trailing -1s and splice the generated tokens back into seq
for i in range(len(output_list)):
output = output_list[i].tolist() if isinstance(
output_list[i], torch.Tensor) else output_list[i]
try:
unfinished = output.index(-1)
except ValueError:
unfinished = len(output)
if output[unfinished - 1] in self.strategy.end_tokens:
unfinished -= 1
bog = output.index(self.tokenizer.get_command('sop'))
last_pos[i] = mask_position + unfinished - (bog + 1)
output_list[i] = output[:mask_position] + output[
bog + 1:unfinished] + output[mask_position + 1:bog]
# Select the best answer
output = output_list[0]
if output[-1] == self.tokenizer.get_command('eos'):
output = output[:-1]
# Avoid generating out-of-range ids; replace them with unk
output = np.array(output)
output[output < 20000] = 20000
output = output.tolist()
answer = self.tokenizer.detokenize(output[prompt_token_length:])
if is_main_process():
print(f"\033[92m'answer'\033[0m: \n{answer}")
return [answer]
def get_logits(self, inputs: List[str]):
mask_id = self.tokenizer.get_command('[MASK]')
sop_id = self.tokenizer.get_command('sop')
tokens = []
targets = []
position_ids = []
attn_masks = []
from mmengine.dist import is_main_process
for raw_text in inputs:
mask_pattern = r'\[MASK\]'
text_list = re.split(mask_pattern, raw_text, 1)
token = sum([
self.tokenizer.tokenize(text_list[0]),
[mask_id, sop_id],
self.tokenizer.tokenize(text_list[1]),
], [])[:-1]
target = sum([
self.tokenizer.tokenize(text_list[0]),
[mask_id],
self.tokenizer.tokenize(text_list[1]),
], [])
if is_main_process():
print(f"\033[92m'raw_text'\033[0m: {raw_text}")
print(f"\033[92m'token'\033[0m: {token}")
seq_length = len(token)
attn_mask = np.ones((seq_length, seq_length), dtype=np.int64)
tokens.append(np.array(token, dtype=np.int64))
targets.append(np.array(target, dtype=np.int64))
position_ids.append(np.arange(0, seq_length, dtype=np.int64))
attn_masks.append(attn_mask)
TILE = 32
length_to_pad = (max(map(len, tokens)) + TILE - 1) // TILE * TILE
token_batch, target_batch, position_id_batch, attention_mask_batch = [], [], [], [] # noqa: E501
for token, target, position_id, attn_mask in zip(
tokens, targets, position_ids, attn_masks):
attn_mask = np.pad(
attn_mask,
pad_width=((0, length_to_pad - len(token)), ),
mode='constant',
constant_values=0,
)
token = np.concatenate(
(token, np.zeros(length_to_pad - len(token), dtype=np.int64)))
target = np.concatenate((target,
np.full(length_to_pad - len(target),
-1,
dtype=np.int64)))
position_id = np.concatenate(
(position_id,
np.zeros(length_to_pad - len(position_id), dtype=np.int64)))
token_batch.append(token)
target_batch.append(target)
position_id_batch.append(position_id)
attention_mask_batch.append(attn_mask)
token_batch = torch.tensor(np.array(token_batch),
dtype=torch.int64).to(self.device)
target_batch = torch.tensor(np.array(target_batch),
dtype=torch.int64).to(self.device)
position_id_batch = torch.tensor(np.array(position_id_batch),
dtype=torch.int64).to(self.device)
attention_mask_batch = (torch.tensor(
np.array(attention_mask_batch), dtype=torch.int64) < 0.5).to(
self.device).bool().unsqueeze(1)
logits, *out_per_layers = self.model(token_batch,
position_id_batch,
attention_mask_batch,
log_attention_weights=None)
if is_main_process():
print(f"\033[92m'target_batch'\033[0m: {target_batch}")
return logits, target_batch
def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features of PPLInferencer
are not needed.
Returns:
List[float]: A list of perplexity scores.
"""
logits, targets = self.get_logits(inputs)
loss_fn = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=-1)
loss = loss_fn(logits.view(-1, logits.size(-1)),
targets.view(-1)).view(targets.size())
from mmengine.dist import is_main_process
if is_main_process():
print(f"\033[92m'loss'\033[0m: {loss}")
if mask_length is not None:
mask = torch.zeros_like(targets) # [batch,seqlen]
for i in range(len(mask)):
for j in range(mask_length[i] - 1, len(mask[i])):
mask[i][j] = 1
loss = loss * mask
lens = (targets != -1).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= np.array(mask_length)
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
if is_main_process():
print(f"\033[92m'lens'\033[0m: {lens}")
print(f"\033[92m'ce_loss'\033[0m: {ce_loss}")
return ce_loss
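A brief sketch (not part of the commit) of the mask-handling step in GLM130B.generate: the prompt is split on the [MASK]/[sMASK]/[gMASK] markers and the text segments are interleaved with the corresponding special tokens:

# Illustration of the splitting logic only; tokenization is left out.
import re

raw_text = 'The capital of France is [MASK] .'
mask_pattern = r'\[[sg]?MASK\]'
text_list = re.split(mask_pattern, raw_text)      # ['The capital of France is ', ' .']
pattern_list = re.findall(mask_pattern, raw_text)  # ['[MASK]']
# generate() tokenizes each text segment and appends the matching mask command
# token after it, then tokenizes the final segment; a [gMASK] is appended when
# the prompt contains no mask at all.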
import os
from typing import Dict, List, Optional, Union
import numpy as np
import torch
from opencompass.models.base import BaseModel
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
@MODELS.register_module()
class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace general models.
Args:
path (str): The name or path to HuggingFace's model.
hf_cache_dir: Set the cache dir to HF model cache dir. If None, it will
use the env variable HF_MODEL_HUB. Defaults to None.
max_seq_len (int): The maximum length of the input sequence. Defaults
to 2048.
tokenizer_path (str): The path to the tokenizer. Defaults to None.
tokenizer_kwargs (dict): Keyword arguments for the tokenizer.
Defaults to {}.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
model_kwargs (dict): Keyword arguments for the model, used in loader.
Defaults to dict(device_map='auto').
meta_template (Dict, optional): The model's meta prompt template,
if needed, e.g. when meta instructions must be injected into or
wrapped around the prompt.
extract_pred_after_decode (bool): Whether to extract the prediction
string from the decoded output string, instead of extracting the
prediction tokens before decoding. Defaults to False.
batch_padding (bool): If False, inference will be performed in a
for-loop without batch padding.
Note:
About ``extract_pred_after_decode``: Commonly, we should extract the
prediction tokens before decoding. But for some tokenizers using
``sentencepiece``, like LLaMA, this behavior may change the number of
whitespaces, which is harmful for Python programming tasks.
"""
def __init__(self,
path: str,
hf_cache_dir: Optional[str] = None,
max_seq_len: int = 2048,
tokenizer_path: Optional[str] = None,
tokenizer_kwargs: dict = dict(),
tokenizer_only: bool = False,
model_kwargs: dict = dict(device_map='auto'),
meta_template: Optional[Dict] = None,
extract_pred_after_decode: bool = False,
batch_padding: bool = False):
super().__init__(path=path,
max_seq_len=max_seq_len,
tokenizer_only=tokenizer_only,
meta_template=meta_template)
from opencompass.utils.fileio import patch_hf_auto_model
if hf_cache_dir is None:
hf_cache_dir = os.getenv('HF_MODEL_HUB', None)
patch_hf_auto_model(hf_cache_dir)
self.logger = get_logger()
self._load_tokenizer(path=path,
tokenizer_path=tokenizer_path,
tokenizer_kwargs=tokenizer_kwargs)
self.batch_padding = batch_padding
self.extract_pred_after_decode = extract_pred_after_decode
if not tokenizer_only:
self._load_model(path=path, model_kwargs=model_kwargs)
def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
tokenizer_kwargs: dict):
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_path if tokenizer_path else path, **tokenizer_kwargs)
if self.tokenizer.pad_token_id is None:
self.logger.warning('pad_token_id is not set for the tokenizer. '
'Using eos_token_id as pad_token_id.')
self.tokenizer.pad_token = self.tokenizer.eos_token
# A patch for llama when batch_padding = True
if 'decapoda-research/llama' in path or \
(tokenizer_path and
'decapoda-research/llama' in tokenizer_path):
self.logger.warning('We set new pad_token_id for LLaMA model')
# keep consistent with official LLaMA repo
# https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb # noqa
self.tokenizer.bos_token = '<s>'
self.tokenizer.eos_token = '</s>'
self.tokenizer.pad_token_id = 0
def _load_model(self, path: str, model_kwargs: dict):
from transformers import AutoModel
model_kwargs.setdefault('torch_dtype', torch.float16)
self.model = AutoModel.from_pretrained(path, **model_kwargs)
self.model.eval()
# A patch for llama when batch_padding = True
if 'decapoda-research/llama' in path:
self.model.config.bos_token_id = 1
self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if self.batch_padding and len(inputs) > 1:
return self._batch_generate(inputs=inputs, max_out_len=max_out_len)
else:
return sum((self._single_generate(inputs=[input_],
max_out_len=max_out_len)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str],
max_out_len: int) -> List[str]:
"""Support for batch prompts inference.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if self.extract_pred_after_decode:
prompt_lens = [len(input_) for input_ in inputs]
# step-1: tokenize the input with batch_encode_plus
tokens = self.tokenizer.batch_encode_plus(inputs,
padding=True,
truncation=True,
max_length=self.max_seq_len - max_out_len)
tokens = {
k: torch.tensor(np.array(tokens[k]), device=self.model.device)
for k in tokens if k in ['input_ids', 'attention_mask']
}
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens, max_new_tokens=max_out_len)
if not self.extract_pred_after_decode:
outputs = outputs[:, tokens['input_ids'].shape[1]:]
decodeds = self.tokenizer.batch_decode(outputs,
skip_special_tokens=True)
if self.extract_pred_after_decode:
decodeds = [
token[len_:] for token, len_ in zip(decodeds, prompt_lens)
]
return decodeds
def _single_generate(self, inputs: List[str],
max_out_len: int) -> List[str]:
"""Support for single prompt inference.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if self.extract_pred_after_decode:
prompt_lens = [len(input_) for input_ in inputs]
input_ids = self.tokenizer(inputs,
truncation=True,
max_length=self.max_seq_len - max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
outputs = self.model.generate(input_ids,
max_new_tokens=max_out_len)
if not self.extract_pred_after_decode:
outputs = outputs[:, input_ids.shape[1]:]
decodeds = self.tokenizer.batch_decode(outputs,
skip_special_tokens=True)
if self.extract_pred_after_decode:
decodeds = [
token[len_:] for token, len_ in zip(decodeds, prompt_lens)
]
return decodeds
def get_logits(self, inputs: List[str]):
if self.batch_padding and len(inputs) > 1:
# batch inference
tokens = self.tokenizer(inputs,
padding=True,
truncation=True,
max_length=self.max_seq_len)
tokens = {
k: torch.tensor(np.array(tokens[k]), device=self.model.device)
for k in tokens if k in ['input_ids', 'attention_mask']
}
outputs = self.model(**tokens)
else:
input_ids = self.tokenizer(
inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
tokens = {'input_ids': input_ids}
outputs = self.model(input_ids)
return outputs[0], {'tokens': tokens}
def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of perplexity scores.
"""
if self.batch_padding and len(inputs) > 1:
assert self.tokenizer.pad_token
return self._get_ppl(inputs, mask_length=mask_length)
else:
return np.concatenate([
self._get_ppl(inputs=[text], mask_length=mask_length)
for text in inputs
])
def _get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of perplexity scores.
"""
outputs, inputs = self.get_logits(inputs)
shift_logits = outputs[..., :-1, :].contiguous()
shift_labels = inputs['tokens']['input_ids'][..., 1:].contiguous()
loss_fct = torch.nn.CrossEntropyLoss(
reduction='none', ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1)).view(shift_labels.size())
if mask_length is not None:
mask = torch.zeros_like(shift_labels) # [batch,seqlen]
for i in range(len(mask)):
for j in range(mask_length[i] - 1, len(mask[i])):
mask[i][j] = 1
loss = loss * mask
lens = (inputs['tokens']['input_ids'] !=
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= np.array(mask_length)
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
return len(self.tokenizer.encode(prompt))
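# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original API): the per-sample loss in
# `HuggingFace._get_ppl` above is a shifted, token-level cross entropy that
# ignores padding. The helper below reproduces that math on dummy tensors so
# the shifting and length handling are easier to follow. `_demo_ppl_math` and
# its dummy shapes are assumptions made purely for demonstration.
def _demo_ppl_math():
    import torch

    batch, seqlen, vocab = 2, 6, 11
    pad_id = 0
    logits = torch.randn(batch, seqlen, vocab)
    input_ids = torch.randint(1, vocab, (batch, seqlen))
    input_ids[1, -2:] = pad_id  # pretend the second sample is padded

    # Shift so that the logits at position t predict the token at t + 1,
    # exactly as done in `_get_ppl`.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none',
                                         ignore_index=pad_id)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)).view(shift_labels.size())

    # Average each sample's summed loss over its non-padding tokens.
    lens = (input_ids != pad_id).sum(-1).numpy()
    return loss.sum(-1).detach().numpy() / lens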
@MODELS.register_module()
class HuggingFaceCausalLM(HuggingFace):
"""Model wrapper around HuggingFace CausalLM.
Args:
path (str): The name or path to HuggingFace's model.
hf_cache_dir: Set the cache dir to HF model cache dir. If None, it will
use the env variable HF_MODEL_HUB. Defaults to None.
max_seq_len (int): The maximum length of the input sequence. Defaults
to 2048.
tokenizer_path (str): The path to the tokenizer. Defaults to None.
tokenizer_kwargs (dict): Keyword arguments for the tokenizer.
Defaults to {}.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
model_kwargs (dict): Keyword arguments for the model, used in loader.
Defaults to dict(device_map='auto').
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the model requires injecting or
wrapping any meta instructions.
batch_padding (bool): If False, inference will be performed in a
for-loop without batch padding. Defaults to False.
"""
def _load_model(self, path: str, model_kwargs: dict):
from transformers import AutoModelForCausalLM
model_kwargs.setdefault('torch_dtype', torch.float16)
self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
self.model.eval()
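# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): how the
# wrapper might be instantiated directly. The model name 'gpt2' and the
# float32 override are placeholders chosen to keep the sketch small and
# CPU-friendly; in OpenCompass the class is normally built from a config
# through the MODELS registry.
def _demo_causal_lm():
    demo_model = HuggingFaceCausalLM(
        path='gpt2',
        tokenizer_path='gpt2',
        max_seq_len=512,
        model_kwargs=dict(torch_dtype=torch.float32),
        batch_padding=False)
    # Continuation of a single prompt, limited to 16 new tokens.
    return demo_model.generate(['Hello, my name is'], max_out_len=16)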
import json
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
@MODELS.register_module(name=['XunFei'])
class XunFei(BaseAPIModel):
"""Model wrapper around OpenAI-AllesAPIN.
Args:
path (str): The name of OpenAI's model.
max_seq_len (int): Unused here.
call_interval (float): The minimum time interval in seconds between two
calls to the API. Defaults to 1.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(self,
path: str,
appid: str,
api_secret: str,
api_key: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
import ssl
import threading
from urllib.parse import urlencode, urlparse
import websocket
self.urlencode = urlencode
self.websocket = websocket
self.websocket.enableTrace(False)
self.threading = threading
self.ssl = ssl
# weird auth keys
self.APISecret = api_secret
self.APIKey = api_key
self.appid = appid
self.hostname = urlparse(path).netloc
self.hostpath = urlparse(path).path
self.headers = {
'content-type': 'application/json',
}
def get_url(self):
from datetime import datetime
from time import mktime
from wsgiref.handlers import format_date_time
cur_time = datetime.now()
date = format_date_time(mktime(cur_time.timetuple()))
tmp = f'host: {self.hostname}\n'
tmp += 'date: ' + date + '\n'
tmp += 'GET ' + self.hostpath + ' HTTP/1.1'
import hashlib
import hmac
tmp_sha = hmac.new(self.APISecret.encode('utf-8'),
tmp.encode('utf-8'),
digestmod=hashlib.sha256).digest()
import base64
signature = base64.b64encode(tmp_sha).decode(encoding='utf-8')
authorization_origin = (f'api_key="{self.APIKey}", '
'algorithm="hmac-sha256", '
'headers="host date request-line", '
f'signature="{signature}"')
authorization = base64.b64encode(
authorization_origin.encode('utf-8')).decode(encoding='utf-8')
v = {
'authorization': authorization,
'date': date,
'host': self.hostname
}
url = self.path + '?' + self.urlencode(v)
return url
def generate(
self,
inputs: List[str or PromptList],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
return results
def _generate(
self,
input: str or PromptList,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
input (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
# FIXME: messages only contains the last input
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
# word_ctr = 0
# TODO: Implement truncation in PromptList
for item in input:
msg = {'content': item['prompt']}
# if word_ctr >= self.max_seq_len:
# break
# if len(msg['content']) + word_ctr > self.max_seq_len:
# msg['content'] = msg['content'][word_ctr -
# self.max_seq_len:]
# word_ctr += len(msg['content'])
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
messages.append(msg)
# in case the word break results in even number of messages
# if len(messages) > 0 and len(messages) % 2 == 0:
# messages = messages[:-1]
data = {
'header': {
'app_id': self.appid,
},
'parameter': {
'chat': {
'domain': 'general',
'max_tokens': max_out_len,
}
},
'payload': {
'message': {
'text': messages
}
}
}
msg = ''
err_code = None
err_data = None
content_received = self.threading.Event()
def on_open(ws):
nonlocal data
ws.send(json.dumps(data))
def on_message(ws, message):
nonlocal msg, err_code, err_data, content_received
err_data = json.loads(message)
err_code = err_data['header']['code']
if err_code != 0:
content_received.set()
ws.close()
else:
choices = err_data['payload']['choices']
status = choices['status']
msg += choices['text'][0]['content']
if status == 2:
content_received.set()
ws.close()
ws = self.websocket.WebSocketApp(self.get_url(),
on_message=on_message,
on_open=on_open)
ws.appid = self.appid
ws.question = messages[-1]['content']
for _ in range(self.retry):
self.wait()
ws.run_forever(sslopt={'cert_reqs': self.ssl.CERT_NONE})
content_received.wait()
if err_code == 0:
return msg.strip()
if err_code == 10013:
return err_data['header']['message']
raise RuntimeError(f'Code: {err_code}, data: {err_data}')
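# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the websocket URL
# built in `XunFei.get_url` is authenticated by HMAC-SHA256-signing the
# "host / date / request-line" string and base64-encoding the result. The
# helper below reproduces that signing scheme with obviously fake
# credentials so the steps can be inspected in isolation.
def _demo_xunfei_signature(api_key: str = 'fake-key',
                           api_secret: str = 'fake-secret',
                           host: str = 'example.invalid',
                           path: str = '/v1/chat') -> str:
    import base64
    import hashlib
    import hmac
    from datetime import datetime
    from time import mktime
    from urllib.parse import urlencode
    from wsgiref.handlers import format_date_time

    date = format_date_time(mktime(datetime.now().timetuple()))
    signing_input = f'host: {host}\ndate: {date}\nGET {path} HTTP/1.1'
    digest = hmac.new(api_secret.encode('utf-8'),
                      signing_input.encode('utf-8'),
                      digestmod=hashlib.sha256).digest()
    signature = base64.b64encode(digest).decode('utf-8')
    authorization_origin = (f'api_key="{api_key}", '
                            'algorithm="hmac-sha256", '
                            'headers="host date request-line", '
                            f'signature="{signature}"')
    authorization = base64.b64encode(
        authorization_origin.encode('utf-8')).decode('utf-8')
    # The query string that would be appended to the websocket url.
    return '?' + urlencode({
        'authorization': authorization,
        'date': date,
        'host': host,
    })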
"""Simple Dataset Reader."""
import random
from typing import Dict, List, Optional, Union
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.registry import ICL_DATASET_READERS
from opencompass.utils.types import (_check_dataset, _check_str,
_check_type_list)
@ICL_DATASET_READERS.register_module()
class DatasetReader:
"""In-conext Learning Dataset Reader Class Generate an DatasetReader
instance through 'dataset'.
Attributes:
dataset (:obj:`Dataset` or :obj:`DatasetDict`): The dataset to be read.
input_columns (:obj:`List[str]` or :obj:`str`): A list of column names
(a string of column name) in the dataset that represent(s) the
input field.
output_column (:obj:`str`): A column name in the dataset that
represents the prediction field.
input_template (:obj:`PromptTemplate`, optional): An instance of the
:obj:`PromptTemplate` class, used to format the input field
content during the retrieval process. (in some retrieval methods)
output_template (:obj:`PromptTemplate`, optional): An instance of the
:obj:`PromptTemplate` class, used to format the output field
content during the retrieval process. (in some learnable retrieval
methods)
train_split (str): The name of the training split. Defaults to 'train'.
train_range (int or float or str, optional): The size of the partial
training dataset to load.
If None, the entire training split will be loaded.
If int, a random subset of that many examples will be loaded; if
float, a random subset covering that fraction of the split will
be loaded.
If str, the subset given by the slice expression will be loaded
(e.g. "[:100]" for the first 100 examples, "[100:200]" for the
next 100, etc.). Defaults to None.
test_split (str): The name of the test split. Defaults to 'test'.
test_range (int or float or str, optional): The size of the partial
test dataset to load.
If None, the entire test split will be loaded.
If int, a random subset of that many examples will be loaded; if
float, a random subset covering that fraction of the split will
be loaded.
If str, the subset given by the slice expression will be loaded
(e.g. "[:100]" for the first 100 examples, "[100:200]" for the
next 100, etc.). Defaults to None.
"""
dataset = None
input_template = None
output_template = None
input_output_template = None
def __init__(self,
dataset: Union[Dataset, DatasetDict, str],
input_columns: Union[List[str], str],
output_column: str,
input_template: Optional[PromptTemplate] = None,
output_template: Optional[PromptTemplate] = None,
train_split: str = 'train',
train_range: Optional[Union[int, float, str]] = None,
test_split: str = 'test',
test_range: Optional[Union[int, float, str]] = None) -> None:
self.input_columns = _check_type_list(input_columns, [List, str])
if isinstance(self.input_columns, str):
self.input_columns = self.input_columns.split()
self.output_column = _check_str(output_column)
train_range = _check_type_list(train_range, [None, int, float, str])
test_range = _check_type_list(test_range, [None, int, float, str])
if input_template is not None:
self.input_template = PromptTemplate._check_prompt_template(
input_template)
if output_template is not None:
self.output_template = PromptTemplate._check_prompt_template(
output_template)
self.dataset = _check_dataset(dataset)
if isinstance(self.dataset, Dataset):
self.dataset = DatasetDict({
'train': self.dataset,
'test': self.dataset
})
# Normalize the dataset so that it has only "train" and "test" splits.
for origin_split, mapped_split, split_range in [[
train_split, 'train', train_range
], [test_split, 'test', test_range]]:
self.dataset[mapped_split] = load_partial_dataset(
self.dataset[origin_split], size=split_range)
def generate_input_field_prompt(self, entry: Dict) -> str:
"""Generate a prompt for the input field based on the provided
:obj:`entry` data.
Args:
entry (:obj:`Dict`): A piece of data to be used for generating the
prompt.
Returns:
:obj:`str`: The generated prompt.
"""
prompt = None
if self.input_template is None:
prompt = ' '.join([str(entry[ctx]) for ctx in self.input_columns])
else:
prompt = self.input_template.generate_item(entry)
return prompt
def generate_input_field_corpus(self,
dataset: Union[Dataset, DatasetDict],
split: Optional[str] = None) -> List[str]:
"""Generate corpus for input field.
Args:
dataset (:obj:`Dataset` or :obj:`DatasetDict`): A
:obj:`datasets.Dataset` or :obj:`datasets.DatasetDict`
instance.
split (:obj:`str`, optional): The split of the dataset to use. If
:obj:`None`, the entire dataset will be used. Defaults to
``None``.
Returns:
:obj:`List[str]`: A list of generated input field prompts.
"""
if split is not None:
dataset = dataset[split]
corpus = []
for entry in dataset:
corpus.append(self.generate_input_field_prompt(entry))
return corpus
def generate_output_field_prompt(self, entry: Dict) -> str:
"""Generate a prompt for the output field based on the provided
:obj:`entry` data.
Args:
entry (:obj:`Dict`): A piece of data to be used for generating the
prompt.
Returns:
:obj:`str`: The generated prompt.
"""
prompt = None
if self.output_template is None:
prompt = str(entry[self.output_column])
else:
prompt = self.output_template.generate_item(entry)
return prompt
def generate_output_field_corpus(self,
dataset: Union[Dataset, DatasetDict],
split: Optional[str] = None) -> List[str]:
"""Generate corpus for output field.
Args:
dataset (:obj:`Dataset` or :obj:`DatasetDict`): A
:obj:`datasets.Dataset` or :obj:`datasets.DatasetDict`
instance.
split (:obj:`str`, optional): The split of the dataset to use.
If :obj:`None`, the entire dataset will be used. Defaults to
``None``.
Returns:
:obj:`List[str]`: A list of generated output field prompts.
"""
if split is not None:
dataset = dataset[split]
corpus = []
for entry in dataset:
corpus.append(self.generate_output_field_prompt(entry))
return corpus
def generate_input_output_field_prompt(self, entry: Dict) -> str:
"""Generate a prompt for the input-output field based on the
provided:obj:`entry` data.
Args:
entry (:obj:`Dict`): A piece of data to be used for generating the
prompt.
Returns:
:obj:`str`: The generated prompt.
"""
prompt = None
if self.input_output_template is None:
prompt = ' '.join([entry[ctx] for ctx in self.input_columns] +
[str(entry[self.output_column])])
else:
prompt = self.input_output_template.generate_item(entry)
return prompt
def _check_dataset_reader(obj) -> 'DatasetReader':
if isinstance(obj, DatasetReader):
return obj
else:
raise TypeError(f'Expected a DatasetReader object, but got {obj}')
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
return self.dataset[idx]
def __repr__(self):
return (f'DatasetReader({{\n dataset: {self.dataset},'
f'\n input_columns: {self.input_columns},\n'
f'    output_column: {self.output_column}\n}})')
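# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): building a
# DatasetReader around a tiny in-memory dataset. The column names 'question'
# and 'answer' are placeholders for whatever the real dataset provides.
def _demo_dataset_reader():
    toy = Dataset.from_dict({
        'question': ['1+1=?', '2+2=?'],
        'answer': ['2', '4'],
    })
    reader = DatasetReader(toy,
                           input_columns=['question'],
                           output_column='answer')
    # Without an input_template, each prompt is the space-joined content of
    # the input columns.
    return reader.generate_input_field_corpus(reader.dataset, split='train')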
def load_partial_dataset(
dataset: Dataset,
size: Optional[Union[int, float, str]] = None) -> Dataset:
"""Load a partial dataset.
Args:
dataset (Dataset): A :obj:`datasets.Dataset` instance.
size (int or float or str, optional): The size of the partial
dataset to load. If None, the entire dataset will be loaded.
If int, a random subset of that many examples will be loaded; if
float, a random subset covering that fraction of the dataset will
be loaded. If str, the subset given by the slice expression
(e.g. "[:100]" for the first 100 examples, "[100:200]" for the
next 100, etc.) will be loaded. Defaults to None.
"""
total_size = len(dataset)
index_list = list(range(total_size))
if isinstance(size, (int, float)):
if size >= total_size or size <= 0:
return dataset
if size > 0 and size < 1:
size = int(size * total_size)
rand = random.Random(x=size)
rand.shuffle(index_list)
dataset = dataset.select(index_list[:size])
elif isinstance(size, str):
dataset = dataset.select(eval(f'index_list{size}'))
return dataset
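# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how the three forms
# of `size` accepted by `load_partial_dataset` behave on a small in-memory
# dataset. The column name 'x' is an arbitrary choice for demonstration.
def _demo_load_partial_dataset():
    toy = Dataset.from_dict({'x': list(range(10))})
    subset_int = load_partial_dataset(toy, size=3)       # random 3 examples
    subset_float = load_partial_dataset(toy, size=0.5)   # random 50% subset
    subset_str = load_partial_dataset(toy, size='[:4]')  # first 4 examples
    return len(subset_int), len(subset_float), len(subset_str)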
class DatasetEncoder(torch.utils.data.Dataset):
def __init__(self,
datalist: List,
model_name=None,
tokenizer=None) -> None:
self.datalist = datalist
if model_name is None and tokenizer is None:
raise ValueError('model_name and tokenizer cannot both be None')
if tokenizer is not None:
self.tokenizer = tokenizer
else:
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.tokenizer.padding_side = 'left'
self.encode_dataset = []
self.init_dataset()
self.datalist_length = len(self.encode_dataset)
def init_dataset(self):
for idx, data in enumerate(self.datalist):
tokenized_data = self.tokenizer.encode_plus(data,
truncation=True,
return_tensors='pt',
verbose=False)
self.encode_dataset.append({
'input_ids':
tokenized_data.input_ids[0],
'attention_mask':
tokenized_data.attention_mask[0],
'metadata': {
'id': idx,
'len': len(tokenized_data.input_ids[0]),
'text': data
}
})
def __len__(self):
return self.datalist_length
def __getitem__(self, idx):
return self.encode_dataset[idx]
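# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): wrapping a
# few raw strings with DatasetEncoder and inspecting one encoded item. The
# tokenizer name 'gpt2' is a placeholder for any HuggingFace tokenizer.
def _demo_dataset_encoder():
    encoder = DatasetEncoder(['hello world', 'in-context learning'],
                             model_name='gpt2')
    first = encoder[0]
    return first['metadata']['len'], first['input_ids'].shape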
from opencompass.registry import ICL_EVALUATORS
from opencompass.utils.text_postprocessors import general_postprocess
from .icl_base_evaluator import BaseEvaluator
@ICL_EVALUATORS.register_module()
class EMEvaluator(BaseEvaluator):
"""Exact match evaluator."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
general_postprocess(prediction) for prediction in predictions
]
processed_answers = [[general_postprocess(j) for j in i]
for i in references]
cnt = 0
for pred, ans, origin_ans in zip(predictions, processed_answers,
references):
if pred in ans or pred in origin_ans:
cnt += 1
score = cnt / len(predictions) * 100
return {'score': score}
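# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): each
# reference passed to EMEvaluator is a list of acceptable answers, and both
# sides are normalized with `general_postprocess` before comparison.
def _demo_em_evaluator():
    em = EMEvaluator()
    return em.score(predictions=['Paris', '42'],
                    references=[['Paris', 'paris'], ['forty-two', '42']])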
from typing import List
import evaluate
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
class HuggingfaceEvaluator(BaseEvaluator):
"""Use huggingface evaluate module to calculate the target metrics.
Args:
metric (str): Metric name in evaluate module.
"""
def __init__(self, metric: str) -> None:
self.metric = metric
super().__init__()
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
return {
'predictions': predictions,
'references': references,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores
def score(self, predictions: List, references: List) -> dict:
"""Calculate scores.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: calculated scores.
"""
if len(predictions) != len(references):
return {'error': 'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'}
metric = evaluate.load(self.metric)
scores = metric.compute(**self._preprocess(predictions, references))
return self._postprocess(scores)
@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
"""Accuracy evaluator."""
def __init__(self) -> None:
super().__init__(metric='accuracy')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
mapping_to_int_dict = {
label: idx
for idx, label in enumerate(set(map(str, references)))
}
pred_set = set(predictions)
for pred in pred_set:
if str(pred) not in mapping_to_int_dict.keys():
mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
golds = [mapping_to_int_dict[str(gold)] for gold in references]
preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
return {
'predictions': preds,
'references': golds,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores["accuracy"] *= 100
return scores
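# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): AccEvaluator
# maps string labels (and any unseen predictions) to integer ids before
# delegating to the huggingface 'accuracy' metric, so free-form labels such
# as 'A'/'B'/'C'/'D' can be scored directly. Loading the metric requires the
# `evaluate` package to be able to fetch it.
def _demo_acc_evaluator():
    acc = AccEvaluator()
    return acc.score(predictions=['A', 'B', 'B'], references=['A', 'B', 'C'])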
@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
"""Rouge evaluator."""
def __init__(self) -> None:
super().__init__(metric='rouge')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return {k: v * 100 for k, v in scores.items()}
@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator."""
def __init__(self) -> None:
super().__init__(metric='sacrebleu')
@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
"""Matthews correlation evaluator."""
def __init__(self) -> None:
super(AccEvaluator, self).__init__(metric='matthews_correlation')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores["matthews_correlation"] *= 100
return scores
@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
"""Squad evaluator."""
def __init__(self) -> None:
super().__init__(metric='squad')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
p_list = [{
'prediction_text': pred.split('\n')[0],
'id': str(i)
} for i, pred in enumerate(predictions)]
r_list = [{
'answers': {
'answer_start': [0],
'text': [ref]
},
'id': str(i)
} for i, ref in enumerate(references)]
return {
'predictions': p_list,
'references': r_list,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores['f1']
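# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): the
# SquadEvaluator only needs plain strings; `_preprocess` wraps them into the
# id/answers structure expected by the huggingface 'squad' metric, keeping
# only the first line of each prediction.
def _demo_squad_evaluator():
    squad = SquadEvaluator()
    return squad.score(
        predictions=['Denver Broncos\nsome trailing rationale'],
        references=['Denver Broncos'])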