Commit c94cc943 authored by Leymore, committed by gaotong

Add release contribution

parent e6b5bdcb
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class AXDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
dataset = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = {
'entailment': 'A',
'not_entailment': 'B'
}[line['label']]
dataset.append(line)
return Dataset.from_list(dataset)
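# A usage sketch of the loader above, assuming a JSONL file in which each
# line is an NLI example whose 'label' is either 'entailment' or
# 'not_entailment'; the loader remaps those labels to 'A' / 'B'. The path
# below is hypothetical.
#
# >>> ds = AXDataset_V2.load('data/SuperGLUE/AX-b/AX-b.jsonl')
# >>> ds[0]['label'] in ('A', 'B')
# True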
from abc import abstractmethod
from typing import Dict, Optional, Union
from datasets import Dataset, DatasetDict
from opencompass.openicl import DatasetReader
class BaseDataset:
    def __init__(self, reader_cfg: Optional[Dict] = None, **kwargs):
        self.dataset = self.load(**kwargs)
        self._init_reader(**(reader_cfg or {}))
def _init_reader(self, **kwargs):
self.reader = DatasetReader(self.dataset, **kwargs)
@property
def train(self):
return self.reader.dataset['train']
@property
def test(self):
return self.reader.dataset['test']
    @staticmethod
    @abstractmethod
def load(**kwargs) -> Union[Dataset, DatasetDict]:
pass
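# A minimal sketch of the contract subclasses follow: `load` returns a
# `Dataset` or `DatasetDict`, while `reader_cfg` (e.g. input_columns and
# output_column) is forwarded to `DatasetReader`, which backs the `train` and
# `test` properties. The class and column names below are illustrative.
#
# class ToyDataset(BaseDataset):
#
#     @staticmethod
#     def load(path: str) -> Dataset:
#         return Dataset.from_list([{'question': 'Q1', 'answer': 'A1'}])
#
# ds = ToyDataset(reader_cfg=dict(input_columns=['question'],
#                                 output_column='answer'),
#                 path='unused')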
import json
import os.path as osp
import re
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
TEXT_POSTPROCESSORS)
from .base import BaseDataset
@LOAD_DATASET.register_module()
class BBHDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
with open(osp.join(path, f'{name}.json'), 'r') as f:
data = json.load(f)['examples']
dataset = Dataset.from_list(data)
return dataset
@TEXT_POSTPROCESSORS.register_module('bbh-mcq')
def bbh_mcq_postprocess(text: str) -> str:
ans = text
ans_line = ans.split('answer is ')
if len(ans_line) != 1:
ans = ans_line[1].strip()
match = re.search(r'\(([A-Z])\)*', ans)
if match:
return match.group(1)
match = re.search(r'([A-Z])', ans)
if match:
return match.group(1)
return ans
@TEXT_POSTPROCESSORS.register_module('bbh-freeform')
def bbh_freeform_postprocess(text: str) -> str:
ans = text
ans_line = ans.split('answer is ')
if len(ans_line) != 1:
ans = ans_line[1].strip()
ans = ans.split('\n')[0]
if ans.endswith('.'):
ans = ans[:-1]
return ans
@ICL_EVALUATORS.register_module()
class BBHEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
                'error': 'predictions and references have different '
                'lengths'
}
predictions = [bbh_freeform_postprocess(pred) for pred in predictions]
cnt = 0
for pred, ref in zip(predictions, references):
if pred == ref:
cnt += 1
score = cnt / len(predictions) * 100
return {'score': score}
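# A small worked example of the two postprocessors and the evaluator above;
# the strings are made up for illustration.
#
# >>> bbh_mcq_postprocess('... so the answer is (B).')
# 'B'
# >>> bbh_freeform_postprocess('... the answer is 42.\nIgnored trailing text')
# '42'
# >>> BBHEvaluator().score(['the answer is 42.'], ['42'])
# {'score': 100.0}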
import json
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class BoolQDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def preprocess(example):
if example['label'] == 'true':
example['answer'] = 1
else:
example['answer'] = 0
return example
dataset = dataset.map(preprocess)
return dataset
@LOAD_DATASET.register_module()
class BoolQDataset_V2(BaseDataset):
@staticmethod
def load(path):
dataset = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = {'true': 'A', 'false': 'B'}[line['label']]
dataset.append(line)
return Dataset.from_list(dataset)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class bustumDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[int(line['label'])]
data.append(line)
return Dataset.from_list(data)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class CMRCDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
data = json.load(f)
        # Convert the raw data into the required format
rows = []
for index, paragraphs in enumerate(data['data']):
for paragraph in paragraphs['paragraphs']:
context = paragraph['context']
for question in paragraph['qas']:
answers = question['answers']
unique_answers = list(set([a['text'] for a in answers]))
rows.append({
'context': context,
'question': question['question'],
'answers': unique_answers
})
        # Build the Dataset
dataset = Dataset.from_dict({
'context': [row['context'] for row in rows],
'question': [row['question'] for row in rows],
'answers': [row['answers'] for row in rows]
})
return dataset
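# The loader above expects CMRC/SQuAD-style JSON, roughly of the form
# {"data": [{"paragraphs": [{"context": "...",
#                            "qas": [{"question": "...",
#                                     "answers": [{"text": "..."}, ...]}]}]}]}
# and flattens it into one row per question, de-duplicating answer texts with
# set() along the way.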
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class GovRepcrsDataset(BaseDataset):
@staticmethod
def load(path: str):
import json
import os
dataset_dict = DatasetDict()
splits = ['train', 'valid', 'test']
dataset_lists = {x: [] for x in splits}
for split in splits:
split_fp = os.path.join(path, 'gov-report', 'split_ids',
'crs_' + split + '.ids')
with open(split_fp, 'r') as f:
for line in f.readlines():
xpath = os.path.join(path, 'gov-report', 'crs',
line.strip() + '.json')
with open(xpath, 'r') as df:
data = json.load(df)
content = data['title'] + '\n' + '\n'.join(
[(x['section_title'] if x['section_title'] else '')
+ '\n' + '\n'.join(x['paragraphs'])
for x in data['reports']['subsections']])
summary = '\n'.join(data['summary'])
dataset_lists[split].append({
'content': content,
'summary': summary,
})
dataset_dict[split] = Dataset.from_list(dataset_lists[split])
return dataset_dict
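# Directory layout assumed by the loader above (derived from the paths it
# builds):
#
#   <path>/gov-report/split_ids/crs_{train,valid,test}.ids  one report id per line
#   <path>/gov-report/crs/<id>.json                         report with 'title',
#                                                           'reports.subsections'
#                                                           and 'summary' fields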
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class IWSLT2017Dataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
dataset = dataset.map(lambda example: example['translation']
).remove_columns('translation')
return dataset
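# A sketch of what the map above does: the nested 'translation' dict of each
# example (e.g. {'de': 'Hallo Welt', 'en': 'Hello world'}) is promoted to
# top-level columns, after which the original 'translation' column is
# dropped. The language pair is illustrative.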
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class NarrativeQADataset(BaseDataset):
@staticmethod
def load(path: str):
import csv
import os
dataset_dict = DatasetDict()
splits = ['train', 'valid', 'test']
dataset_lists = {x: [] for x in splits}
with open(os.path.join(path, 'qaps.csv'), 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if row[1] == 'set':
continue
split = row[1] # set
answers = [row[3], row[4]] # row['answer1'], row['answer2']
question = row[2] # question
x_path = os.path.join(path, 'tmp',
row[0] + '.content') # document_id
                try:
                    with open(x_path, 'r', encoding='utf-8') as f:
                        evidence = f.read(100000)
                except:  # noqa: E722
                    # skip entries whose story content file is missing or
                    # unreadable
                    continue
dataset_lists[split].append({
'answer': answers,
'question': question,
'evidence': evidence,
})
for split in splits:
dataset_dict[split] = Dataset.from_list(dataset_lists[split])
return dataset_dict
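# Expected inputs of the loader above (inferred from the column indices it
# reads): qaps.csv has columns
#   row[0] document_id, row[1] set (train/valid/test), row[2] question,
#   row[3] answer1, row[4] answer2,
# and each story's text lives in <path>/tmp/<document_id>.content, of which
# at most the first 100000 characters are used as evidence.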
import json
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class WSCDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def preprocess(example):
text_list = example['text'].split(' ')
assert ' ' not in example['target']['span2_text']
# span1 may have 1 or more than 1 words
# span2 is the pronoun and has only 1 word
text_list[example['target']
['span2_index']] = example['target']['span1_text']
example['new_text'] = ' '.join(text_list)
if example['label'] == 'true':
example['answer'] = 1
else:
example['answer'] = 0
example['span1'] = example['target']['span1_text']
example['span2'] = example['target']['span2_text']
del example['target']
return example
dataset = dataset.map(preprocess)
return dataset
@LOAD_DATASET.register_module()
class WSCDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
item = {
'span1': line['target']['span1_text'],
'span2': line['target']['span2_text'],
'text': line['text'],
'label': {
'true': 'A',
'false': 'B'
}[line['label']],
}
data.append(item)
return Dataset.from_list(data)
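# A worked example of the WSC preprocessing above (values are made up): the
# pronoun at span2_index is replaced in place by span1_text to build
# 'new_text'.
#
#   text        = 'Mark told Pete many lies about himself .'
#   span1_text  = 'Mark', span2_text = 'himself', span2_index = 6
#   new_text    = 'Mark told Pete many lies about Mark .'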
import os
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
@MODELS.register_module()
class OpenAI(BaseAPIModel):
"""Model wrapper around OpenAI's models.
Args:
path (str): The name of OpenAI's model.
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
        retry (int): Number of retries if the API call fails. Defaults to 2.
        key (str): OpenAI API key. If set to "ENV", the key will be read from
            the environment variable $OPENAI_API_KEY, matching the openai
            package's default behaviour. Defaults to 'ENV'.
        meta_template (Dict, optional): The model's meta prompt template, if
            any meta instructions need to be injected into or wrapped around
            the prompt.
openai_api_base (str): The base url of OpenAI's API. Defaults to
'https://api.openai.com/v1'.
"""
is_api: bool = True
def __init__(self,
path: str,
max_seq_len: int = 2048,
query_per_second: int = 1,
retry: int = 2,
key: str = 'ENV',
meta_template: Optional[Dict] = None,
openai_api_base: str = 'https://api.openai.com/v1'):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template,
query_per_second=query_per_second,
retry=retry)
import openai
import tiktoken
self.openai = openai
self.tiktoken = tiktoken
self.openai.api_key = os.getenv(
'OPENAI_API_KEY') if key == 'ENV' else key
        self.openai.api_base = openai_api_base
def generate(
self,
        inputs: List[PromptType],
max_out_len: int = 512,
temperature: float = 0.7,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
            inputs (List[str or PromptList]): A list of strings or
                PromptLists. Each PromptList should be organized in
                OpenCompass' API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 0.7.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
return results
    def _generate(self, input: PromptType, max_out_len: int,
                  temperature: float) -> str:
"""Generate results given a list of inputs.
Args:
inputs (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
# max num token for gpt-3.5-turbo is 4097
max_out_len = min(max_out_len, 4000 - self.get_token_len(str(input)))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
        max_num_retries = 0
        while max_num_retries < self.retry:
            self.wait()
            try:
                response = self.openai.ChatCompletion.create(
                    model=self.path,
                    messages=messages,
                    max_tokens=max_out_len,
                    n=1,
                    stop=None,
                    temperature=temperature,
                )
                # Return as soon as a completion is received instead of
                # burning the remaining retries on successful calls.
                return response.choices[0].message.content.strip()
            except self.openai.error.RateLimitError:
                pass
            max_num_retries += 1
        raise RuntimeError(
            f'OpenAI API call failed after {self.retry} retries.')
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized string. Only English and Chinese
characters are counted for now. Users are encouraged to override this
method if more accurate length is needed.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
enc = self.tiktoken.encoding_for_model(self.path)
return len(enc.encode(prompt))
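# A usage sketch, assuming a valid key in $OPENAI_API_KEY; the model name and
# the returned text are illustrative.
#
# >>> model = OpenAI(path='gpt-3.5-turbo', max_seq_len=4096, retry=3)
# >>> model.generate(['Say hi in one word.'], max_out_len=16)
# ['Hi.']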
"""Basic Inferencer."""
import json
import os
from pathlib import Path
from typing import List, Optional
import numpy as np
from mmengine.dist import is_main_process
from torch.utils.data import DataLoader
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
class BaseInferencer:
"""Base Inferencer class for all evaluation Inferencer.
Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_model_token_num (:obj:`int`, optional): Maximum number of
tokenized words allowed by the LM.
batch_size (:obj:`int`, optional): Batch size for the
:obj:`DataLoader`.
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
api_name (:obj:`str`, optional): Name of API service.
call_api (:obj:`bool`): If ``True``, an API for LM models will be used,
determined by :obj:`api_name`.
"""
model = None
def __init__(
self,
model,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
**kwargs,
) -> None:
self.model = model
self.max_seq_len = max_seq_len
self.batch_size = batch_size
self.output_json_filepath = output_json_filepath
self.output_json_filename = output_json_filename
self.is_main_process = is_main_process()
if not os.path.exists(self.output_json_filepath):
os.makedirs(self.output_json_filepath)
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
"""Perform In-Context Inference given a retriever and optional
templates.
Args:
retriever (:obj:`BaseRetriever`): An instance of a Retriever class
that will be used to retrieve in-context examples
ice_template (:obj:`PromptTemplate`, optional): A template for
generating the in-context examples prompt. Defaults to None.
prompt_template (:obj:`PromptTemplate`, optional): A template for
generating the final prompt. Defaults to None.
output_json_filepath (:obj:`str`, optional): The file path to save
the results as a `JSON` file. Defaults to None.
output_json_filename (:obj:`str`, optional): The file name to save
the results as a `JSON` file. Defaults to None.
Raises:
NotImplementedError: If the function is not implemented in the
subclass.
Returns:
            :obj:`List`: A list of strings, each representing the result of
            one inference.
"""
raise NotImplementedError("Method hasn't been implemented yet")
@staticmethod
def get_dataloader(datalist: List[List], batch_size: int) -> DataLoader:
"""Return a dataloader of the input data list."""
dataloader = DataLoader(datalist,
batch_size=batch_size,
collate_fn=lambda x: x)
return dataloader
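# The identity collate_fn above keeps each batch as a plain Python list of
# prompts instead of letting DataLoader try to stack or tensorize them, e.g.:
#
# >>> list(BaseInferencer.get_dataloader(['p1', 'p2', 'p3'], batch_size=2))
# [['p1', 'p2'], ['p3']]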
def dump_results_dict(results_dict, filename):
with open(filename, 'w', encoding='utf-8') as json_file:
json.dump(results_dict, json_file, indent=4, ensure_ascii=False)
class GenInferencerOutputHandler:
origin_prompt_dict = {}
output_dict = {}
prediction_dict = {}
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, Path(save_dir) / filename)
def save_results(self, origin_prompt, prediction, idx):
self.results_dict[str(idx)] = {
'origin_prompt': origin_prompt,
'prediction': prediction,
}
class PPLInferencerOutputHandler:
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, Path(save_dir) / filename)
def save_ice(self, ice):
for idx, example in enumerate(ice):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['in-context examples'] = example
def save_predictions(self, predictions):
for idx, prediction in enumerate(predictions):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['prediction'] = prediction
def save_prompt_and_ppl(self, label, input, prompt, ppl, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
str(label)]['testing input'] = input
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
self.results_dict[str(idx)]['label: ' + str(label)]['PPL'] = ppl
def save_prompt_and_condprob(self, input, prompt, cond_prob, idx, choices):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
# TODO:
# for single token situation, the input will always be yes currently
self.results_dict[str(idx)]['testing input'] = input
self.results_dict[str(idx)]['prompt'] = prompt
# TODO: hard code here
self.results_dict[str(idx)]['choices'] = choices
        # To compute AUC scores later, store the scores as the prediction
self.results_dict[str(idx)]['prediction'] = cond_prob
# set pred label in case needed
self.results_dict[str(idx)]['pred_label'] = int(np.argmax(cond_prob))
"""MDL Retriever."""
from typing import List, Optional
import numpy as np
import torch
import tqdm
from transformers import AutoModelForCausalLM
from opencompass.openicl import PromptTemplate
from opencompass.openicl.icl_retriever.icl_topk_retriever import TopkRetriever
from opencompass.openicl.utils.logging import get_logger
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
logger = get_logger(__name__)
@ICL_RETRIEVERS.register_module()
class MDLRetriever(TopkRetriever):
"""MDL Retriever, subclass of `TopkRetriever`. MDL is a abbreviation of
Minimum Description Length, specially designed for ppl evaluation. You may
refer to the paper for more details: https://arxiv.org/pdf/2212.10375.pdf.
Args:
dataset (`BaseDataset`): Any BaseDataset instances.
Attributes of ``reader``, ``train`` and ``test`` will be used.
ice_separator (`Optional[str]`): The separator between each in-context
example template when origin `PromptTemplate` is provided. Defaults
to '\n'.
ice_eos_token (`Optional[str]`): The end of sentence token for
in-context example template when origin `PromptTemplate` is
provided. Defaults to '\n'.
ice_num (`Optional[int]`): The number of in-context example template
when origin `PromptTemplate` is provided. Defaults to 1.
sentence_transformers_model_name (`Optional[str]`): The name of the
sentence transformers model. Defaults to 'all-mpnet-base-v2'.
tokenizer_name (`Optional[str]`): The name of the tokenizer. Defaults
to 'gpt2-xl'.
batch_size (`Optional[int]`): The batch size for the dataloader.
Defaults to 1.
candidate_num (`Optional[int]`): The number of candidates to retrieve
for each example. Defaults to 1.
ce_model_name (`Optional[str]`): The name of the model for calculating
MDL. Defaults to 'gpt2-xl'.
select_time (`Optional[int]`): The number of times to select MDL.
Defaults to 5.
ice_template (`Optional[PromptTemplate]`): The template for in-context
example. Defaults to None.
prompt_template (`Optional[PromptTemplate]`): The template for prompt.
Defaults to None.
labels (`Optional[List]`): The labels for calculating MDL. Defaults to
None.
seed (`Optional[int]`): The seed for random. Defaults to 1.
"""
metric_model = None
def __init__(self,
dataset,
ice_separator: Optional[str] = '\n',
ice_eos_token: Optional[str] = '\n',
ice_num: Optional[int] = 1,
sentence_transformers_model_name: Optional[
str] = 'all-mpnet-base-v2',
tokenizer_name: Optional[str] = 'gpt2-xl',
batch_size: Optional[int] = 1,
candidate_num: Optional[int] = 1,
ce_model_name: Optional[str] = 'gpt2-xl',
select_time: Optional[int] = 5,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
labels: Optional[List] = None,
seed: Optional[int] = 1) -> None:
super().__init__(dataset, ice_separator, ice_eos_token, ice_num,
sentence_transformers_model_name, tokenizer_name,
batch_size)
self.ce_model_name = ce_model_name
self.candidate_num = candidate_num
self.select_time = select_time
self.ice_template = ICL_PROMPT_TEMPLATES.build(ice_template)
if prompt_template is not None:
self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
else:
self.prompt_template = None
self.labels = labels
self.seed = seed
def topk_search(self):
np.random.seed(self.seed)
res_list = self.forward(self.dataloader)
rtr_idx_list = [[] for _ in range(len(res_list))]
logger.info('Retrieving data for test set...')
for entry in tqdm.tqdm(res_list, disable=not self.is_main_process):
idx = entry['metadata']['id']
embed = np.expand_dims(entry['embed'], axis=0)
near_ids = self.index.search(
embed, min(self.candidate_num,
len(self.index_ds)))[1][0].tolist()
candidates = []
mdl_scores = []
for j in range(self.select_time):
if j == 0:
rand_idx_list = near_ids[:self.ice_num]
else:
rand_idx_list = np.random.choice(near_ids,
self.ice_num,
replace=False)
rand_idx_list = [int(i) for i in rand_idx_list]
candidates.append(rand_idx_list)
ice = self.generate_ice(rand_idx_list,
ice_template=self.ice_template)
ice = str(ice)
mask_length = len(
self.tokenizer(ice + self.ice_eos_token,
verbose=False)['input_ids'])
if self.labels is None:
labels = self.get_labels(self.ice_template,
self.prompt_template)
else:
labels = self.labels
prompt_list = []
for label in labels:
prompt = self.generate_label_prompt(
idx, ice, label, self.ice_template,
self.prompt_template)
prompt = str(prompt)
prompt_list.append(prompt)
loss_list = self.cal_ce(prompt_list, mask_length=mask_length)
probs = np.exp(-np.array(loss_list))
normalized_probs = probs / probs.sum(0, keepdims=True)
neg_entropy = -entropy(normalized_probs, label_dim=0)
mdl_scores.append(neg_entropy)
rtr_idx_list[idx] = candidates[mdl_scores.index(max(mdl_scores))]
rtr_idx_list[idx] = [int(i) for i in rtr_idx_list[idx]]
return rtr_idx_list
def retrieve(self):
"""Retrieve the in-context example index for each test example."""
return self.topk_search()
def cal_ce(self, input_texts: List[str], mask_length=None):
if self.metric_model is None:
logger.info(
f'Load model {self.ce_model_name} for calculating MDL...')
self.metric_model = AutoModelForCausalLM.from_pretrained(
self.ce_model_name)
self.metric_model.to(self.device)
inputs = self.tokenizer(input_texts,
padding=True,
return_tensors='pt',
truncation=True)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
outputs = self.metric_model(**inputs)
shift_logits = outputs.logits[..., :-1, :].contiguous()
shift_labels = inputs['input_ids'][..., 1:].contiguous()
loss_fct = torch.nn.CrossEntropyLoss(
reduction='none', ignore_index=self.tokenizer.pad_token_id)
shift_logits = shift_logits.view(-1, shift_logits.size(-1))
loss = loss_fct(shift_logits,
shift_labels.view(-1)).view(shift_labels.size())
if mask_length is not None:
mask = torch.cat([
torch.zeros([loss.shape[0], mask_length], dtype=torch.float),
torch.ones([loss.shape[0], loss.shape[-1] - mask_length],
dtype=torch.float)
], -1)
mask = mask.to(self.device)
loss = torch.mul(mask, loss)
lens = (inputs['input_ids'] !=
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= mask_length
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def entropy(probs: np.ndarray, label_dim: int = 0, mask=None):
if mask is None:
return -(probs * np.log(probs)).sum(label_dim)
return -(mask * probs * np.log(probs)).sum(label_dim)
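# A numeric sketch of the MDL scoring used in topk_search above: lower
# per-label cross-entropy gives a more peaked (normalized) label
# distribution, hence a larger negative entropy, so that candidate ordering
# wins the max() over mdl_scores. The losses are made up.
#
# >>> loss_list = [0.2, 2.0]
# >>> probs = np.exp(-np.array(loss_list))
# >>> normalized_probs = probs / probs.sum(0, keepdims=True)
# >>> float(-entropy(normalized_probs, label_dim=0))  # approximately -0.41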
"""Votek Retriever."""
import json
import os
import random
from collections import defaultdict
from typing import Optional
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from opencompass.openicl.icl_retriever.icl_topk_retriever import TopkRetriever
class VotekRetriever(TopkRetriever):
"""Vote-k In-context Learning Retriever, subclass of `TopkRetriever`.
**WARNING**: This class has not been tested thoroughly. Please use it with
caution.
"""
def __init__(self,
dataset,
ice_separator: Optional[str] = '\n',
ice_eos_token: Optional[str] = '\n',
ice_num: Optional[int] = 1,
sentence_transformers_model_name: Optional[
str] = 'all-mpnet-base-v2',
tokenizer_name: Optional[str] = 'gpt2-xl',
batch_size: Optional[int] = 1,
votek_k: Optional[int] = 3) -> None:
super().__init__(dataset, ice_separator, ice_eos_token, ice_num,
sentence_transformers_model_name, tokenizer_name,
batch_size)
self.votek_k = votek_k
def votek_select(self,
embeddings=None,
select_num=None,
k=None,
overlap_threshold=None,
vote_file=None):
n = len(embeddings)
if vote_file is not None and os.path.isfile(vote_file):
with open(vote_file) as f:
vote_stat = json.load(f)
else:
vote_stat = defaultdict(list)
for i in range(n):
cur_emb = embeddings[i].reshape(1, -1)
cur_scores = np.sum(cosine_similarity(embeddings, cur_emb),
axis=1)
sorted_indices = np.argsort(cur_scores).tolist()[-k - 1:-1]
for idx in sorted_indices:
if idx != i:
vote_stat[idx].append(i)
if vote_file is not None:
with open(vote_file, 'w', encoding='utf-8') as f:
json.dump(vote_stat, f)
votes = sorted(vote_stat.items(),
key=lambda x: len(x[1]),
reverse=True)
j = 0
selected_indices = []
while len(selected_indices) < select_num and j < len(votes):
candidate_set = set(votes[j][1])
flag = True
for pre in range(j):
cur_set = set(votes[pre][1])
if len(candidate_set.intersection(
cur_set)) >= overlap_threshold * len(candidate_set):
flag = False
break
if not flag:
j += 1
continue
selected_indices.append(int(votes[j][0]))
j += 1
if len(selected_indices) < select_num:
unselected_indices = []
cur_num = len(selected_indices)
for i in range(n):
if i not in selected_indices:
unselected_indices.append(i)
selected_indices += random.sample(unselected_indices,
select_num - cur_num)
return selected_indices
def vote_k_search(self):
vote_k_idxs = self.votek_select(embeddings=self.embed_list,
select_num=self.ice_num,
k=self.votek_k,
overlap_threshold=1)
return [vote_k_idxs[:] for _ in range(len(self.test_ds))]
def retrieve(self):
return self.vote_k_search()
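# Sketch of the vote-k selection above: every candidate example "votes" for
# its k nearest neighbours (by cosine similarity of embeddings); candidates
# are then taken in descending vote count while skipping any whose voters
# overlap too heavily with those of already-selected candidates, and the
# selection is topped up with random unselected examples if needed.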
import json
import os
import time
import numpy as np
import openai
import requests
OPENICL_API_NAME_LIST = ['opt-175b', 'gpt3']
OPENICL_API_PARAMETER_DICT = {
'opt-175b': ['URL', 'headers'],
'gpt3': [
'engine', 'temperature', 'max_tokens', 'top_p', 'frequency_penalty',
'presence_penalty', 'sleep_time'
]
}
OPENICL_API_REQUEST_CONFIG = {
'opt-175b': {
'URL': '', # http://xxx/completions or http://xxx/generate
'headers': {
'Content-Type': 'application/json; charset=UTF-8'
}
},
'gpt3': {
'engine': 'text-davinci-003',
'temperature': 0,
'max_tokens': 256,
'top_p': 1.0,
'frequency_penalty': 0.0,
'presence_penalty': 0.0,
'sleep_time': 3
}
}
PROXIES = {'https': '', 'http': ''}
def is_api_available(api_name):
if api_name is None:
return False
return True if api_name in OPENICL_API_NAME_LIST else False
def update_openicl_api_request_config(api_name, **kwargs):
if api_name is None or not is_api_available(api_name):
return
parameter_list = OPENICL_API_PARAMETER_DICT[api_name]
for parameter in parameter_list:
if parameter in kwargs.keys():
OPENICL_API_REQUEST_CONFIG[api_name][parameter] = kwargs[parameter]
def api_get_ppl(api_name, input_texts):
if api_name == 'opt-175b':
        payload = {'prompt': input_texts, 'max_tokens': 0, 'echo': True}
response = json.loads(
requests.post(
OPENICL_API_REQUEST_CONFIG[api_name]['URL'],
                data=json.dumps(payload),
headers=OPENICL_API_REQUEST_CONFIG[api_name]['headers'],
proxies=PROXIES).text)
lens = np.array(
[len(r['logprobs']['tokens']) for r in response['choices']])
ce_loss = np.array([
-sum(r['logprobs']['token_logprobs']) for r in response['choices']
])
return ce_loss / lens
if api_name == 'gpt3':
raise NotImplementedError("GPT-3 API doesn't support PPL calculation")
def api_get_tokens(api_name, input_texts):
length_list = [len(text) for text in input_texts]
if api_name == 'opt-175b':
        payload = {'prompt': input_texts, 'max_tokens': 100, 'echo': True}
response = json.loads(
requests.post(
OPENICL_API_REQUEST_CONFIG[api_name]['URL'],
                data=json.dumps(payload),
headers=OPENICL_API_REQUEST_CONFIG[api_name]['headers'],
proxies=PROXIES).text)
return [r['text'] for r in response['choices']], [
r['text'][length:]
for r, length in zip(response['choices'], length_list)
]
if api_name == 'gpt3':
openai.api_key = os.getenv('OPENAI_API_KEY')
response = openai.Completion.create(
engine=OPENICL_API_REQUEST_CONFIG['gpt3']['engine'],
prompt=input_texts,
temperature=OPENICL_API_REQUEST_CONFIG['gpt3']['temperature'],
max_tokens=OPENICL_API_REQUEST_CONFIG['gpt3']['max_tokens'],
top_p=OPENICL_API_REQUEST_CONFIG['gpt3']['top_p'],
frequency_penalty=OPENICL_API_REQUEST_CONFIG['gpt3']
['frequency_penalty'],
presence_penalty=OPENICL_API_REQUEST_CONFIG['gpt3']
['presence_penalty'])
time.sleep(OPENICL_API_REQUEST_CONFIG['gpt3']['sleep_time'])
return [(input + r['text'])
for r, input in zip(response['choices'], input_texts)
], [r['text'] for r in response['choices']]
import os.path as osp
from typing import Dict, List
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from opencompass.utils import get_infer_output_path
from .base import BasePartitioner
@PARTITIONERS.register_module()
class NaivePartitioner(BasePartitioner):
"""Naive task partitioner. This partitioner will generate a task for each
model-dataset pair.
Args:
config (ConfigDict): The full config dict.
"""
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[Dict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [[]], # a nested list of dataset configs, each
list corresponds to a model
'work_dir': '', # the work dir
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
                existence of the result file in this directory.
Returns:
List[Dict]: A list of tasks.
"""
tasks = []
for model in models:
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
task = Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir
})
tasks.append(task)
return tasks
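# Resulting task layout: with models [m1, m2] and datasets [d1, d2],
# NaivePartitioner emits up to four tasks, each shaped like
#   {'models': [m1], 'datasets': [[d1]], 'work_dir': work_dir},
# skipping any pair whose inference output file already exists.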
import copy
import math
import os.path as osp
from typing import List, Tuple, Union
import mmengine
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path)
from .base import BasePartitioner
@PARTITIONERS.register_module()
class SizePartitioner(BasePartitioner):
"""Task partitioner based on the size of the dataset (with some rough
expansion as an estimation of computational cost).
Args:
out_dir (str): The output directory of tasks.
max_task_size (int): The maximum size of a task.
gen_task_coef (int): The dataset cost measurement coefficient for
generation tasks.
dataset_size_path (str): The path to the dataset size cache file.
"""
def __init__(self,
out_dir: str,
max_task_size: int = 2000,
gen_task_coef: int = 20,
dataset_size_path: str = '.cache/dataset_size.json'):
super().__init__(out_dir)
self.max_task_size = max_task_size
self.gen_task_coef = gen_task_coef
self.dataset_size_path = dataset_size_path
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[ConfigDict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [[]], # a nested list of dataset configs, each
list corresponds to a model
'work_dir': '', # the work dir
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
                existence of the result file in this directory.
Returns:
List[ConfigDict]: A list of tasks.
"""
datasets = sorted(datasets,
key=lambda x: self.get_cost(x),
reverse=True)
tasks = []
for model in models:
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir
})
num_data = 0
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
root, ext = osp.splitext(filename)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
                        # skip the split if its output already exists
if not osp.exists(f'{root}_{i}{ext}'):
tasks.append(
Config({
'models': [model],
'datasets': [[dataset_split]],
'work_dir': work_dir
}))
else:
if num_data + dataset_size > self.max_task_size:
tasks.append(task)
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir
})
num_data = 0
task['datasets'][0].append(dataset)
num_data = num_data + dataset_size
if task['datasets'][0]:
tasks.append(task)
return tasks
@property
def dataset_size(self):
if not hasattr(self, '_dataset_size'):
if osp.exists(self.dataset_size_path):
self._dataset_size = mmengine.load(self.dataset_size_path)
else:
self._dataset_size = {}
return self._dataset_size
def split_dataset(self, dataset_cfg: ConfigDict) -> List[ConfigDict]:
"""Split dataset into several parts."""
dataset_size, num_repeats = self.get_cost(dataset_cfg,
get_raw_factors=True)
split_configs = []
abbr = dataset_abbr_from_cfg(dataset_cfg)
step = self.max_task_size // num_repeats
# evenly distribute the task
step = math.ceil(dataset_size / math.ceil(dataset_size / step))
for part, i in enumerate(range(0, dataset_size, step)):
cfg = copy.deepcopy(dataset_cfg)
cfg['abbr'] = abbr + f'_{part}'
test_range = cfg['reader_cfg'].get('test_range', '')
cfg['reader_cfg']['test_range'] = f'{test_range}[{i}:{i+step}]'
split_configs.append(cfg)
return split_configs
def get_cost(self,
dataset: ConfigDict,
get_raw_factors: bool = False) -> Union[int, Tuple[int, int]]:
"""Get the computational cost of inferring on the dataset.
Args:
dataset (ConfigDict): The dataset config.
get_raw_factors (bool): If True, the raw factors of computational
cost will be returned.
Returns:
int or Tuple[int, int]: The size of the dataset. If get_raw_factors
is True, the number of repeats will also be returned.
"""
dataset_abbr = dataset_abbr_from_cfg(dataset)
# If it's the PPL template, the dataset size will be multiplied by the
# number of labels
infer_cfg = dataset.infer_cfg
test_range = dataset.reader_cfg.get('test_range', '')
template = (infer_cfg.prompt_template.template if 'prompt_template'
in infer_cfg else infer_cfg.ice_template.template)
# If it's the Gen template, the dataset size will be multiplied by the
# self.gen_task_coef
factor = self.gen_task_coef
if isinstance(template, dict):
ctr = sum(key in template for key in ('begin', 'round', 'end'))
if ctr != len(template.keys()):
factor = len(template.keys())
if dataset_abbr in self.dataset_size:
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
f'{test_range})')
if get_raw_factors:
return actual_size, factor
return factor * actual_size
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,
ensure_ascii=False)
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
f'{test_range})')
if get_raw_factors:
return actual_size, factor
return factor * actual_size
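# A numeric sketch of split_dataset above: for a PPL-style dataset with two
# labels (num_repeats = 2) and 5000 test examples, step = 2000 // 2 = 1000,
# which is already even (ceil(5000 / ceil(5000 / 1000)) = 1000), so the
# test_range slices become [0:1000], [1000:2000], ..., [4000:5000] and each
# split gets the abbr suffix _0 ... _4.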
from mmengine.registry import Registry
PARTITIONERS = Registry('partitioner', locations=['opencompass.partitioners'])
RUNNERS = Registry('runner', locations=['opencompass.runners'])
TASKS = Registry('task', locations=['opencompass.tasks'])
MODELS = Registry('model', locations=['opencompass.models'])
# TODO: LOAD_DATASET -> DATASETS
LOAD_DATASET = Registry('load_dataset', locations=['opencompass.datasets'])
TEXT_POSTPROCESSORS = Registry(
'text_postprocessors', locations=['opencompass.utils.text_postprocessors'])
EVALUATORS = Registry('evaluators', locations=['opencompass.evaluators'])
ICL_INFERENCERS = Registry('icl_inferencers',
locations=['opencompass.openicl.icl_inferencer'])
ICL_RETRIEVERS = Registry('icl_retrievers',
locations=['opencompass.openicl.icl_retriever'])
ICL_DATASET_READERS = Registry(
'icl_dataset_readers',
locations=['opencompass.openicl.icl_dataset_reader'])
ICL_PROMPT_TEMPLATES = Registry(
'icl_prompt_templates',
locations=['opencompass.openicl.icl_prompt_template'])
ICL_EVALUATORS = Registry('icl_evaluators',
locations=['opencompass.openicl.icl_evaluator'])
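# Usage sketch: components register themselves into these registries and are
# later built from config dicts via mmengine's Registry API; the names below
# are illustrative.
#
# >>> @LOAD_DATASET.register_module()
# ... class MyDataset(BaseDataset):
# ...     ...
# >>> dataset = LOAD_DATASET.build(dict(type='MyDataset', path='data/my.jsonl'))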
import getpass
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Tuple
from mmengine.config import Config, ConfigDict
from opencompass.utils import LarkReporter, get_logger
class BaseRunner:
"""Base class for all runners. A runner is responsible for launching
multiple tasks.
Args:
task (ConfigDict): Task type config.
debug (bool): Whether to run in debug mode.
lark_bot_url (str): Lark bot url.
"""
def __init__(self,
task: ConfigDict,
debug: bool = False,
                 lark_bot_url: Optional[str] = None):
self.task_cfg = Config(task)
self.debug = debug
if lark_bot_url:
self.lark_reporter = LarkReporter(lark_bot_url)
else:
self.lark_reporter = None
def __call__(self, tasks: List[Dict[str, Any]]):
"""Launch multiple tasks and summarize the results.
Args:
tasks (list[dict]): A list of task configs, usually generated by
Partitioner.
"""
status = self.launch(tasks)
self.summarize(status)
@abstractmethod
def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
"""Launch multiple tasks.
Args:
tasks (list[dict]): A list of task configs, usually generated by
Partitioner.
Returns:
list[tuple[str, int]]: A list of (task name, exit code).
"""
def summarize(self, status: List[Tuple[str, int]]) -> None:
"""Summarize the results of the tasks.
Args:
status (list[tuple[str, int]]): A list of (task name, exit code).
"""
failed_logs = []
for _task, code in status:
if code != 0:
get_logger().error(f'{_task} failed with code {code}')
failed_logs.append(_task)
if self.lark_reporter:
num_succeeded = len(status) - len(failed_logs)
            if len(failed_logs) > 0:
                content = f'The {self.task_cfg.type} tasks launched by '
                content += f'{getpass.getuser()} have finished: '
                content += f'{num_succeeded} succeeded, '
                content += f'{len(failed_logs)} failed. Failed tasks:'
                content += '\n' + '\n'.join(failed_logs)
                self.lark_reporter.post(
                    title=f'Bad news: {len(failed_logs)} task(s) failed',
                    content=content)
            else:
                content = f'The {self.task_cfg.type} tasks launched by '
                content += f'{getpass.getuser()} have finished: '
                content += f'all {num_succeeded} succeeded.'
                self.lark_reporter.post(title='Good news: all tasks finished',
                                        content=content)