Commit 66bb89e5 authored by FarzanehNakhaee's avatar FarzanehNakhaee

Merge branch 'big-refactor' into add-prost-config

parents e8bb77db 070b6b9c
group:
  - super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-copa-t5-prompt
dataset_path: super_glue
dataset_name: copa
training_split: train
......
def convert_choice(choice):
    return choice[0].lower() + choice[1:]


def doc_to_text(doc):
    # Drop the period
    connector = {
        "cause": "because",
        "effect": "therefore",
    }[doc["question"]]
    return doc["premise"].strip()[:-1] + f" {connector}"


def doc_to_target(doc):
    correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
    # Connect the sentences
    return " " + convert_choice(correct_choice)
group:
  - super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
......
group:
  - super-glue-lm-eval-v1
task: "wic"
dataset_path: super_glue
dataset_name: wic
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
  - metric: acc
group:
  - super-glue-promptsource
task: "GPT-3-prompt"
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3-prompt"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "GPT-3-prompt-with-label"
use_prompt: "promptsource:GPT-3-prompt-with-label"
include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "affirmation_true_or_false"
use_prompt: "promptsource:affirmation_true_or_false"
def doc_to_text(doc):
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
group:
  - super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-wsc-t5-prompt
dataset_path: super_glue
dataset_name: wsc
training_split: train
......
......@@ -10,10 +10,11 @@ import collections
import importlib.util
import fnmatch
from typing import List, Union
from typing import List, Literal, Union
import gc
import torch
import transformers
from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
......@@ -22,15 +23,6 @@ from itertools import islice
from lm_eval.logger import eval_logger
class ExitCodeError(Exception):
    pass


def sh(x):
    if os.system(x):
        raise ExitCodeError()


def escaped_split(text, sep_char, maxsplit=-1):
    """Split text into a list on occurrences of the given separation
    character `sep_char`. The separation character may be escaped by a
......@@ -180,26 +172,6 @@ def make_disjoint_window(pair):
    return a[: len(a) - (len(b) - 1)], b
def select_continuation_from_batch_left_padding(
    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
    """Select the continuation from the batch, removing prompts of different lengths.

    Args:
        generations (Union[List[List[int]], torch.Tensor]):
            A tensor or list-of-lists of shape [batch_size, sequence length].
        max_context_size (int):
            The size of the biggest context; generations will proceed from that
            index.

    Example:
        PAD PAD Continue : The dog chased the cat [every day of the week]
        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD

    Output:
        [every day of the week]
        [yesterday] PAD PAD PAD PAD
    """
    return generations[:, max_context_size:]
class Reorderer:
    def __init__(self, arr, fn):
        self.size = len(arr)
......@@ -229,6 +201,64 @@ class Reorderer:
        return res
class Grouper:
    """
    takes an array `arr` and function `fn` and returns a dictionary
    with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
    objects in `arr` satisfying `key == fn(ob)`.
    """

    def __init__(self, arr, fn):
        # self.orig_arr = arr
        self.size = len(arr)
        arr = list(enumerate(arr))

        def group_return_dict(arr, fn):
            res = collections.defaultdict(list)

            for ob in arr:
                res[fn(ob)].append(ob)
            return res

        arr = group_return_dict(arr, lambda x: fn(x[1]))

        # self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
        self.arr = arr
        self._grouped = None

    def get_grouped(self):
        # return the contents but not indices for our grouped dict.
        if self._grouped:
            return self._grouped
        grouped = {}
        for key in self.arr.keys():
            # drop the index from each element of self.arr
            grouped[key] = [y[1] for y in self.arr[key]]
        self._grouped = grouped
        return grouped

    def get_original(self, grouped_dict):
        # take in a grouped dictionary with e.g. results for each key listed
        # in the same order as the instances in `self.arr`, and
        # return the results in the same (single list) order as `self.orig_arr`.
        res = [None] * self.size
        cov = [False] * self.size
        # orig = [None] * self.size

        assert grouped_dict.keys() == self.arr.keys()

        for key in grouped_dict.keys():
            for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
                res[ind] = v
                cov[ind] = True
                # orig[ind] = _

        assert all(cov)
        # assert orig == self.orig_arr

        return res
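A minimal usage sketch for Grouper (example data invented): items are grouped by a key function, processed per group, and the per-group results are restored to the original ordering.

grouper = Grouper([3, 1, 4, 1, 5, 9, 2, 6], fn=lambda x: x % 2)
grouped = grouper.get_grouped()          # {1: [3, 1, 1, 5, 9], 0: [4, 2, 6]}
doubled = {k: [v * 2 for v in vs] for k, vs in grouped.items()}
print(grouper.get_original(doubled))     # [6, 2, 8, 2, 10, 18, 4, 12]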
def make_table(result_dict):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter
......@@ -339,7 +369,8 @@ def get_git_commit_hash():
    try:
        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
        git_hash = git_hash.decode()
    except subprocess.CalledProcessError:
    except (subprocess.CalledProcessError, FileNotFoundError):
        # FileNotFoundError occurs when git is not installed on the system
        git_hash = None
    return git_hash
......@@ -399,7 +430,13 @@ def load_yaml_config(yaml_path):
    return yaml_config


def regex_replace(string, pattern, repl, count=0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)


env = Environment(loader=BaseLoader, undefined=StrictUndefined)
env.filters["regex_replace"] = regex_replace
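A minimal sketch of how the registered filter can be used from a Jinja template (the template string and field name are invented for illustration):

template = env.from_string("{{ premise | regex_replace('\\.$', '') }} because")
print(template.render(premise="The dog chased the cat."))
# -> "The dog chased the cat because"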
def apply_template(template, doc):
......@@ -416,6 +453,116 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    return islice(raw_iterator, rank, limit, world_size)
def pad_and_concat(
    max_length: int,
    tensors: List[torch.Tensor],
    padding_side: Literal["right", "left"] = "right",
):
    """
    Method for padding a list of tensors given the maximum tensor
    length in the batch. Used for batching inputs and continuations in
    seq2seq models.
    """
    assert (
        padding_side == "left" or padding_side == "right"
    ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"

    for i, tensor in enumerate(tensors):
        tensor = tensor.squeeze(0)  # squeeze, in case passed [1, seq] size
        tensor_len = tensor.shape[0]
        if tensor_len < max_length:
            if padding_side == "right":
                # right-pad
                tensors[i] = torch.cat(
                    [
                        tensor,  # [seq]
                        torch.zeros(
                            max_length - tensor_len,
                            dtype=torch.long,
                            device=tensor.device,
                        ),  # [padding_length - seq]
                    ],
                    dim=0,
                ).unsqueeze(0)
            else:
                # left-pad
                tensors[i] = torch.cat(
                    [
                        torch.zeros(
                            max_length - tensor_len,
                            dtype=torch.long,
                            device=tensor.device,
                        ),  # [padding_length - seq]
                        tensor,  # [seq]
                    ],
                    dim=0,
                ).unsqueeze(0)
        else:
            tensors[i] = tensor.unsqueeze(0)

    return torch.cat(tensors, dim=0)
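A minimal usage sketch (tensors invented): two token-id sequences of different lengths are padded to the batch maximum and stacked into a single [batch, max_length] tensor.

a = torch.tensor([1, 2, 3], dtype=torch.long)
b = torch.tensor([4, 5], dtype=torch.long)
batch = pad_and_concat(3, [a, b], padding_side="left")
print(batch)  # tensor([[1, 2, 3],
              #         [0, 4, 5]])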
def clear_torch_cache():
    gc.collect()
    torch.cuda.empty_cache()


def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype
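A brief usage sketch for get_dtype: string names resolve through getattr(torch, ...), while "auto" is passed through unchanged for Hugging Face to handle.

assert get_dtype("bfloat16") is torch.bfloat16
assert get_dtype("auto") == "auto"
assert get_dtype(torch.float16) is torch.float16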
# Multi-token stopping criteria
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence."""

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
            :, -self.sequence_id_len :
        ]
        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
        for i, done in enumerate(self.done_tracker):
            if not done:
                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
        return False not in self.done_tracker


def stop_sequences_criteria(
    tokenizer: transformers.PreTrainedTokenizer,
    stop_sequences: List[str],
    initial_decoder_input_length: int,
    batch_size: int,
) -> transformers.StoppingCriteriaList:
    return transformers.StoppingCriteriaList(
        [
            *[
                MultiTokenEOSCriteria(
                    sequence, tokenizer, initial_decoder_input_length, batch_size
                )
                for sequence in stop_sequences
            ],
        ]
    )
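A hypothetical end-to-end sketch (the model and tokenizer choices are assumptions, not part of the diff) showing how these criteria plug into Hugging Face generation so decoding stops once a stop string appears after the prompt:

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Q: What is 2+2?\nA:", return_tensors="pt")
criteria = stop_sequences_criteria(
    tokenizer,
    stop_sequences=["\n\n"],
    initial_decoder_input_length=inputs["input_ids"].shape[1],
    batch_size=1,
)
out = model.generate(**inputs, stopping_criteria=criteria, max_new_tokens=32)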
......@@ -39,9 +39,8 @@ def parse_args():
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--use_cache", type=str, default=None)
parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
parser.add_argument("--write_out", action="store_true", default=False)
parser.add_argument("--output_base_path", type=str, default=None)
......@@ -78,12 +77,6 @@ def main():
    eval_logger.info(f"Selected Tasks: {task_names}")

    # TODO: description_dict?
    # description_dict = {}
    # if args.description_dict_path:
    #     with open(args.description_dict_path, "r") as f:
    #         description_dict = json.load(f)

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
......@@ -92,9 +85,8 @@
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        no_cache=args.no_cache,
        use_cache=args.use_cache,
        limit=args.limit,
        # description_dict=description_dict,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
......@@ -103,8 +95,7 @@
    if results is not None:
        samples = results.pop("samples")
        dumped = json.dumps(results, indent=2)
        dumped = json.dumps(results, indent=2, default=lambda o: str(o))
        print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
......
......@@ -13,12 +13,10 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_base_path", required=True)
    parser.add_argument("--tasks", default="all_tasks")
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--sets", type=str, default="val")  # example: val,test
    parser.add_argument("--num_fewshot", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_examples", type=int, default=1)
    parser.add_argument("--description_dict_path", default=None)
    return parser.parse_args()
......@@ -32,11 +30,6 @@ def main():
    task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # description_dict = {}
    # if args.description_dict_path:
    #     with open(args.description_dict_path, "r") as f:
    #         description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        rnd = random.Random()
......@@ -55,12 +48,6 @@ def main():
        docs = join_iters(iters)

        # description = (
        #     description_dict[task_name]
        #     if description_dict and task_name in description_dict
        #     else ""
        # )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (
                zip(range(args.num_examples), docs)
......@@ -72,7 +59,6 @@ def main():
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
                    # description=description,
                )
                f.write(ctx + "\n")
......@@ -28,7 +28,9 @@ setuptools.setup(
    python_requires=">=3.9",
    install_requires=[
        "accelerate>=0.18.0",
        "evaluate",
        "datasets>=2.0.0",
        "evaluate>=0.4.0",
        "jsonlines",
        "numexpr",
        "openai>=0.6.4",
......@@ -53,7 +55,7 @@ setuptools.setup(
        "promptsource": [
            "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
        ],
        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
        "anthropic": ["anthropic"],
    },
)
......@@ -3,17 +3,21 @@ import lm_eval.tasks
import lm_eval.models


def test_description_dict():
def test_description():
    seed = 42
    num_examples = 1
    task_names = ["hellaswag", "winogrande"]
    task_names = ["arc_challenge", "lambada"]
    description_dict = {
        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "winogrande": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
    }

    task_dict = lm_eval.tasks.get_task_dict(task_names)

    for task_name, task in task_dict.items():
        # patch description field in task (# TODO: make this much more cleaned up)
        task._config.description = description_dict[task_name]

        rnd = random.Random()
        rnd.seed(seed)
......@@ -37,6 +41,5 @@ def test_description_dict():
                doc=doc,
                num_fewshot=1,
                rnd=rnd,
                description=description,
            )
            assert description in ctx
import os
import lm_eval.base as base
# import lm_eval.base as base
import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
import lm_eval.models as models
# import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
......@@ -15,8 +19,10 @@ import pytest
def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    os.system("rm test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    # TODO: re-add cachingLM
    # os.system("rm test_cache.db")
    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    lm = registry.get_model("dummy")()
    def ll_fn(reqs):
        for ctx, cont in reqs:
......@@ -55,7 +61,6 @@ def test_evaluator(taskname, task_class):
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
......@@ -63,7 +68,6 @@ def test_evaluator(taskname, task_class):
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
......
import pytest
import lm_eval.metrics as metrics
import lm_eval.api.metrics as metrics
import random
......
import lm_eval.tasks as tasks
import lm_eval.base as base
import pytest
from itertools import islice
......@@ -100,5 +100,5 @@ def test_documents_and_requests(taskname, task_class):
        reqs = [reqs]

    # todo: mock lm after refactoring evaluator.py to not be a mess
    for req in reqs:
        assert isinstance(req, base.Request)
    # for req in reqs:
    #     assert isinstance(req, base.Request)