Commit ff314d62 authored by Jonathan Tow's avatar Jonathan Tow


Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into evaluator-description-option
parents 564e0612 df5d7cf0
import abc
import random
from typing import Iterable
import numpy as np
import re
from lm_eval import tasks
import os
import json
import hashlib
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC):
def __init__(self):
self.cache_hook = CacheHook(None)
@abc.abstractmethod
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
......@@ -35,7 +43,7 @@ class LM(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
......@@ -78,7 +86,7 @@ class LM(abc.ABC):
pass
# TODO: Add an optional max length
@abc.abstractmethod
@abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
......@@ -97,18 +105,235 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
:param arg_string: str
Left up to individual model class to handle
"""
return cls()
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str): pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]): pass
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
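For illustration, a hedged, standalone sketch of the comma-separated `key=value` convention that `create_from_arg_string` builds on; `simple_parse_args_string_sketch` is a hypothetical stand-in for `utils.simple_parse_args_string`, which is not shown in this diff:

# Hypothetical re-implementation for illustration only; not part of this commit.
def simple_parse_args_string_sketch(arg_string):
    # "pretrained=gpt2,batch_size=4" -> {"pretrained": "gpt2", "batch_size": "4"}
    if not arg_string:
        return {}
    return dict(kv.split("=", 1) for kv in arg_string.split(","))

args = simple_parse_args_string_sketch("pretrained=gpt2,batch_size=4")
additional_config = {"device": "cpu", "subfolder": None}  # e.g. CLI flags
args2 = {k: v for k, v in additional_config.items() if v is not None}
# The model is then constructed as cls(**args, **args2):
print({**args, **args2})  # {'pretrained': 'gpt2', 'batch_size': '4', 'device': 'cpu'}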
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
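As a hedged usage sketch (not from this commit), requests to `loglikelihood` are `(context, continuation)` string pairs, and each result is a `(logprob, is_greedy)` tuple:

# `lm` would be any concrete BaseLM subclass, e.g. the HFLM defined below.
requests = [
    ("The quick brown fox jumps over the lazy", " dog"),
    ("", "The"),  # empty context is scored against the end-of-text token
]
# Each returned element is (log P(continuation | context), is_greedy), where
# is_greedy records whether the continuation equals the argmax decoding:
# for logprob, is_greedy in lm.loglikelihood(requests):
#     print(logprob, is_greedy)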
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
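To make the rolling-window idea concrete, here is a hedged, self-contained sketch of the splitting behavior (a simplified stand-in, not the actual `utils.get_rolling_token_windows` / `make_disjoint_window` implementation):

# Assumption: context_len=1 and a prefix token conditions the first window,
# so every token in the string is predicted exactly once.
def rolling_windows_sketch(tokens, prefix_token, max_seq_len):
    windows = []
    pred_start = 0
    while pred_start < len(tokens):
        pred_end = min(pred_start + max_seq_len, len(tokens))
        # everything before the predicted span (plus the prefix) may condition it,
        # clipped so the window fits in max_seq_len (+1 for the prefix token)
        ctx_start = max(0, pred_end - max_seq_len)
        context = ([prefix_token] + tokens)[ctx_start:pred_start + 1]
        windows.append((context, tokens[pred_start:pred_end]))
        pred_start = pred_end
    return windows

print(rolling_windows_sketch(list(range(10)), prefix_token=50256, max_seq_len=4))
# -> ([50256], [0..3]), ([3], [4..7]), ([5, 6, 7], [8, 9])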
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be overestimates rather than underestimates, which is more useful for planning
# - when going through the list, the first sequence in a batch is always the longest, so it sets the batch's
# padded context length. this simplifies the batching logic and, more importantly, makes automatic
# adaptive batching much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1],
dtype=torch.long
).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad length from seq to padding_length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length]
multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks \
in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
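The effect of `_collate`'s descending-length sort can be shown in isolation (standalone sketch, not using `utils.Reorderer`):

# The longest request leads each batch, so it fixes the padded length up front
# and any OOM surfaces on the very first batch rather than near the end.
reqs = [[1, 2], [3, 4, 5, 6], [7], [8, 9, 10]]
order = sorted(range(len(reqs)), key=lambda i: (-len(reqs[i]), tuple(reqs[i])))
batches = [order[i:i + 2] for i in range(0, len(order), 2)]  # batch_size = 2
print([[reqs[i] for i in b] for b in batches])
# [[[3, 4, 5, 6], [8, 9, 10]], [[1, 2], [7]]]
# After scoring, `order` is inverted to scatter results back to input order,
# which is what Reorderer.get_original does.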
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str):
until = [until]
primary_until, = self.tok_encode(until[0])
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
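A hedged usage sketch for `greedy_until`: each request pairs a context with one or more stop sequences, and the decoded text is trimmed at the earliest stop, mirroring the `s.split(term)[0]` loop above:

requests = [
    ("Q: What is 2+2?\nA:", ["\n"]),           # stop at the first newline
    ("Translate: bonjour ->", [".", "\n"]),    # several stop sequences
]
# Post-processing of a raw generation, as done above:
s = " 4\nQ: next question"
for term in ["\n"]:
    s = s.split(term)[0]
print(repr(s))  # ' 4'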
class Task(abc.ABC):
......@@ -129,17 +354,17 @@ class Task(abc.ABC):
"""Downloads the task dataset if necessary"""
pass
@abc.abstractmethod
@abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
@abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
@abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
......@@ -171,15 +396,15 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k)
@abc.abstractmethod
@abstractmethod
def doc_to_text(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def doc_to_target(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -193,7 +418,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -206,7 +431,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
......@@ -215,7 +440,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def higher_is_better(self):
"""
:returns: {str: bool}
......@@ -243,7 +468,9 @@ class Task(abc.ABC):
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs())
self._fewshot_docs = list(
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
......@@ -258,7 +485,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
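As an illustration of that final concatenation (hypothetical contents; the real pieces come from `doc_to_text`/`doc_to_target` and the sampled few-shot docs):

description = "Answer the question.\n\n"
labeled_examples = (
    "Q: Capital of France?\nA: Paris\n\n"
    "Q: Capital of Japan?\nA: Tokyo\n\n"
)
example = "Q: Capital of Italy?\nA:"  # the query doc, target left blank
print(description + labeled_examples + example)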
class MultipleChoiceTask(Task):
class MultipleChoiceTask(Task, abc.ABC):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
......@@ -330,10 +557,10 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
loglikelihood, = results
words = self.count_words(doc)
bytes = self.count_bytes(doc)
bytes_ = self.count_bytes(doc)
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes),
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (-loglikelihood, self.count_bytes(doc))
}
......@@ -344,25 +571,16 @@ class PerplexityTask(Task, abc.ABC):
"bits_per_byte": weighted_mean
}
def count_bytes(self, doc):
@classmethod
def count_bytes(cls, doc):
return len(doc.encode("utf-8"))
def count_words(self, doc):
@classmethod
def count_words(cls, doc):
""" Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc))
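A hedged sketch of what `process_results` emits for a single document: `(value, weight)` pairs that the aggregation functions later reduce across all documents:

doc = "hello world"
loglikelihood = -12.3                  # as returned by loglikelihood_rolling
words = len(doc.split())               # 2, default whitespace word boundary
bytes_ = len(doc.encode("utf-8"))      # 11
print({
    "word_perplexity": (loglikelihood, words),
    "byte_perplexity": (loglikelihood, bytes_),
    "bits_per_byte": (-loglikelihood, bytes_),
})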
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
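Usage sketch for `hash_args`; serializing through JSON makes the cache key deterministic across processes:

import hashlib
import json

def hash_args(attr, args):
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode('utf-8')).hexdigest()

key = hash_args("loglikelihood", ("The quick brown fox", " jumps"))
print(key[:16])  # same prefix every run for the same (attr, args)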
......@@ -385,9 +603,17 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db): os.makedirs(os.path.dirname(cache_db), exist_ok=True)
if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm
......@@ -411,13 +637,14 @@ class CachingLM:
res.append(None)
remaining_reqs.append(req)
# actually run the LM
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1
while res[resptr] is not None:
resptr += 1
res[resptr] = r
......@@ -433,32 +660,39 @@ class CachingLM:
return CacheHook(self)
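A hedged usage sketch for `CachingLM` (assumes a concrete model is available locally, so the calls are shown commented out):

# base_lm = lm_eval.models.get_model("gpt2").create_from_arg_string("device=cpu")
# lm = CachingLM(base_lm, "lm_cache/gpt2.db")
# r1 = lm.loglikelihood([("The quick brown fox", " jumps")])  # hits the model
# r2 = lm.loglikelihood([("The quick brown fox", " jumps")])  # served from sqlite
# assert r1 == r2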
REQUEST_RETURN_LENGTHS = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
class Request:
def __init__(self, type, args, index=None):
if type not in req_ret_lens.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(type))
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.type = type
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0
for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i)
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i)
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.type}{self.args}[{self.index}]\n"
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory:
def __getattr__(self, attr):
......
......@@ -8,7 +8,33 @@ import lm_eval.tasks
import lm_eval.base
import numpy as np
def simple_evaluate(model, model_args, task_names, description_path=None, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
:param model_args: str
String arguments for each model class, see LM.create_from_arg_string
:param task_names: list[str]
List of task names
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
......@@ -17,7 +43,9 @@ def simple_evaluate(model, model_args, task_names, description_path=None, num_fe
})
if not no_cache:
lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
lm = lm_eval.base.CachingLM(
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict(task_names)
description_dict = {}
......@@ -44,10 +72,34 @@ def simple_evaluate(model, model_args, task_names, description_path=None, num_fe
return results
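A typical invocation, as a hedged sketch (the task name is assumed to exist in the local task registry, and model weights must be available):

if __name__ == "__main__":
    from lm_eval import evaluator

    results = evaluator.simple_evaluate(
        model="gpt2",
        model_args="device=cpu",
        task_names=["lambada"],  # assumed present in lm_eval.tasks
        num_fewshot=0,
        limit=10,                # testing only, per the docstring warning
    )
    print(evaluator.make_table(results))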
def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap_iters=100000):
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
......@@ -55,23 +107,25 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
# we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad
# (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of requeste
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
#default to test doc, fall back to val doc if validation unavailable
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
elif task.has_validation_docs():
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
......@@ -90,25 +144,26 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
description=description
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id))
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. we could also implement some kind of autogrouping here; they should end up next to each other.
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
......@@ -135,7 +190,10 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
......@@ -146,6 +204,7 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
def make_table(result_dict):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter
md_writer = MarkdownTableWriter()
......@@ -158,11 +217,11 @@ def make_table(result_dict):
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
for m, v in dic.items():
if m.endswith("_stderr"): continue
if m.endswith("_stderr"):
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else:
values.append([k, version, m, '%.4f' % v, '', ''])
......
import math
from collections import Iterable
from pprint import pprint
from collections.abc import Iterable
import numpy as np
import sacrebleu
......@@ -63,6 +62,7 @@ def acc_all(items):
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def acc_all_stderr(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
......@@ -98,6 +98,7 @@ def weighted_mean(items):
a, b = zip(*items)
return sum(a) / sum(b)
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
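A tiny worked example of these two helpers (standalone):

import math

def weighted_mean(items):
    a, b = zip(*items)
    return sum(a) / sum(b)

def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))

# Two documents as (total logprob, token count) pairs:
print(weighted_perplexity([(-6.0, 3), (-4.0, 2)]))  # exp(10/5) = e**2 ≈ 7.389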
......@@ -179,12 +180,13 @@ def _sacreformat(refs, preds):
return refs, preds
## stderr stuff
# stderr stuff
class _bootstrap_internal:
def __init__(self, f, n):
self.f = f
self.n = n
def __call__(self, v):
i, xs = v
rnd = random.Random()
......@@ -208,7 +210,9 @@ def bootstrap_stderr(f, xs, iters):
chunk_size = min(1000, iters)
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(pool.imap(_bootstrap_internal(f, chunk_size), [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
for bootstrap in tqdm(pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
# sample w replacement
res.extend(bootstrap)
......
......@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
......
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
from lm_eval.base import BaseLM
class GPT2LM(LM):
MAX_GEN_TOKS = 256
class HFLM(BaseLM):
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
......@@ -19,183 +13,71 @@ class GPT2LM(LM):
assert isinstance(batch_size, int)
if device:
self.device = torch.device(device)
self._device = torch.device(device)
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained, revision=revision +("/" + subfolder if subfolder is not None else "")).to(self.device)
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
# pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
self.VOCAB_SIZE = self.tokenizer.vocab_size
self.EOT_TOKEN_ID = self.tokenizer.eos_token_id
print(self.EOT_TOKEN_ID)
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparantly
self.max_length = self.gpt2.config.max_position_embeddings
self.vocab_size = self.tokenizer.vocab_size
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
self.batch_size = batch_size_per_gpu# * gpus
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context, add_special_tokens=False)
continuation_enc = self.tokenizer.encode(continuation, add_special_tokens=False)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string, add_special_tokens=False),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
# this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
contlens = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
# cont_toks 4 5 6 7 8 9
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad to length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
#last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
try:
return self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gpt2.config.max_position_embeddings
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
......@@ -203,43 +85,19 @@ class GPT2LM(LM):
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits retuned from the model
logits returned from the model
"""
with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0], add_special_tokens=False)
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context, add_special_tokens=False)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tokenizer.encode(until[0], add_special_tokens=False)
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
# for backwards compatibility
GPT2LM = HFLM
import os
import numpy as np
import transformers
from lm_eval.base import LM
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
......@@ -24,8 +36,11 @@ def get_result(response, ctxlen):
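The remainder of `get_result` is elided by the hunk; as a hedged sketch (assumed, not shown in this diff), the greedy check presumably compares each continuation token against the API's top-ranked token, using the standard fields of the OpenAI logprobs object (`tokens`, `token_logprobs`, `top_logprobs`):

# Assumed shape of the elided is_greedy loop; illustrative only.
def is_greedy_sketch(logprobs, ctxlen):
    for i in range(ctxlen, len(logprobs["token_logprobs"])):
        token = logprobs["tokens"][i]
        top = logprobs["top_logprobs"][i]      # dict: candidate token -> logprob
        if max(top, key=top.get) != token:
            return False
    return True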
def oa_completion(**kwargs):
import openai
""" Query OpenAI API for completion.
Retry with back-off until they respond
"""
import openai
backoff_time = 3
while True:
try:
......@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5
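The body of the retry loop is elided above; a minimal standalone sketch of the back-off pattern the docstring describes (the real function calls `openai.Completion.create(**kwargs)`):

import time

def with_backoff_sketch(call, **kwargs):
    backoff_time = 3
    while True:
        try:
            return call(**kwargs)
        except Exception:  # the real code catches OpenAI API errors
            time.sleep(backoff_time)
            backoff_time *= 1.5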
class GPT3LM(LM):
MAX_LENGTH = 2048
class GPT3LM(BaseLM):
REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
......@@ -50,10 +62,12 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
......@@ -64,53 +78,36 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
loglikelihoods = []
for string, in tqdm(requests):
encoded = self.tokenizer.encode_plus(string)["input_ids"]
rolling_token_windows = utils.get_rolling_token_windows(
token_list=encoded,
prefix_token=self.end_of_text_token_id,
max_seq_len=self.MAX_LENGTH,
context_len=1,
)
string_loglikelihoods = []
for input_tokens, pred_tokens in rolling_token_windows:
block_output = self.get_token_logprobs(
input_tokens=input_tokens,
pred_tokens=pred_tokens,
)
string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods)
return loglikelihoods
@property
def eot_token_id(self):
return self.tokenizer.eos_token_id
@property
def max_length(self):
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(self, requests):
import openai
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
......@@ -118,16 +115,18 @@ class GPT3LM(LM):
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
# max_length+1 because the API takes up to 2049 tokens, including the first context token
inp = (context_enc + continuation_enc)[-(self.max_length+1):]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))
inps.append(inp)
ctxlens.append(ctxlen)
......@@ -151,35 +150,14 @@ class GPT3LM(LM):
return reord.get_original(res)
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests):
if not requests: return []
import openai
if not requests:
return []
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0])
return (len(toks), x[0])
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
......@@ -193,34 +171,43 @@ class GPT3LM(LM):
lastuntil = x[1]
ret.append(x)
if ret: yield ret, lastuntil
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until`
# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = []
for context, _ in chunk:
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.MAX_GEN_TOKS,
max_tokens=self.max_gen_toks,
temperature=0.,
logprobs=10,
stop=until
stop=until,
)
for resp, (context, until) in zip(response.choices, chunk):
for resp, (context, until_) in zip(response.choices, chunk):
s = resp['text']
for term in until:
for term in until_:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
self.cache_hook.add_partial("greedy_until", (context, until_), s)
res.append(s)
return reord.get_original(res)
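To illustrate `sameuntil_chunks` from the loop above (standalone sketch): consecutive requests sharing the same `until` are grouped, capped at the chunk size:

def sameuntil_chunks_sketch(xs, size):
    ret, lastuntil = [], xs[0][1]
    for x in xs:
        if len(ret) >= size or x[1] != lastuntil:
            yield ret, lastuntil
            ret, lastuntil = [], x[1]
        ret.append(x)
    if ret:
        yield ret, lastuntil

reqs = [("a", ["\n"]), ("b", ["\n"]), ("c", ["."]), ("d", ["."])]
print([(len(chunk), until) for chunk, until in sameuntil_chunks_sketch(reqs, 20)])
# [(2, ['\n']), (2, ['.'])]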
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
import argparse
import json
import numpy as np
import random
import logging
from lm_eval import models, tasks, evaluator, base
from lm_eval import tasks, evaluator
logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
......@@ -22,9 +21,10 @@ def parse_args():
parser.add_argument('--no_cache', action="store_true")
return parser.parse_args()
def main():
args = parse_args()
assert not args.provide_description # not implemented
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
......@@ -35,15 +35,14 @@ def main():
task_names = args.tasks.split(",")
results = evaluator.simple_evaluate(
args.model,
args.model_args,
task_names,
args.description_path,
args.num_fewshot,
args.batch_size,
args.device,
args.no_cache,
args.limit
model=args.model,
model_args=args.model_args,
task_names=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
)
dumped = json.dumps(results, indent=2)
......@@ -54,8 +53,12 @@ def main():
with open(args.output_path, "w") as f:
f.write(dumped)
print(f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}")
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
)
print(evaluator.make_table(results))
if __name__ == "__main__":
main()
......@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
name="lm_eval",
version="0.0.1",
version="0.1.0",
author="Leo Gao",
author_email="lg@eleuther.ai",
description="A framework for evaluating autoregressive language models",
......@@ -20,7 +20,7 @@ setuptools.setup(
],
python_requires='>=3.6',
install_requires=[
"black==20.8b1",
"black",
"best_download>=0.0.6",
"datasets==1.15.1",
"click>=7.1",
......
......@@ -10,8 +10,8 @@ import pytest
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
os.system("rm test_cache.db")
......@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
......@@ -50,5 +51,5 @@ def test_evaluator(taskname, Task):
e1 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
e2 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
# check taht caching is working
# check that caching is working
assert e1 == e2
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
import os
import json
......@@ -10,10 +7,11 @@ import mock
import pickle
import hashlib
os.environ['OPENAI_API_SECRET_KEY'] = ""
def completion(**kwargs):
def mock_completion(**kwargs):
# Mock completion function
# Loads from a cached+pickled response if it exists, otherwise it will actually try to ping the OpenAI API
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
......@@ -21,16 +19,15 @@ def completion(**kwargs):
with open(fname, 'rb') as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, 'wb') as fh:
pickle.dump(ret, fh)
return ret
os.makedirs("tests/testdata", exist_ok=True)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
(ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'),
......@@ -69,15 +66,18 @@ def test_gpt3():
print([x[0] for x in vals])
targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
targets = [
-34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
......@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
gpt3.MAX_LENGTH = 5
with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.93490880000002
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
import pytest
import unittest.mock as mock
import lm_eval.models as models
......@@ -38,22 +39,31 @@ def test_gpt2():
assert gen == ', lazy fox and they both fall to the ground'
targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
targets = [
-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
-341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
def test_gpt2_perplexity():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487])
tgt = sum([
-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
-3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
])
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt2 to have shorter context length to induce rolling windows
gpt2.max_length = 5
with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
mock_max_length.return_value = 5
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813])
tgt = sum([
-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
-4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
])
assert perplexity == pytest.approx(tgt, rel=1e-3)
......@@ -4,13 +4,13 @@ import pytest
from itertools import islice
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, task_class):
print('Evaluating task', taskname)
#dl = Task.download
#Task.download = MagicMock()
task = Task()
#Task.download = dl
# dl = task_class.download
# task_class.download = MagicMock()
task = task_class()
# task_class.download = dl
assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False]
......@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
assert isinstance(task.higher_is_better(), dict)
assert task.aggregation().keys() == task.higher_is_better().keys()
for v in task.higher_is_better().values(): assert v in [True, False]
for v in task.higher_is_better().values():
assert v in [True, False]
assert isinstance(task.VERSION, int)
# test deterministic docs
# (don't test train because it's slow)
task2 = Task()
task2 = task_class()
limit = None
if taskname in ["triviaqa"]: limit = 10000
if taskname in ["triviaqa"]:
limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit))
......@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
assert reqs == reqs2
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, task_class):
print('Evaluating task', taskname)
task = Task()
task = task_class()
fns = []
if task.has_training_docs(): fns.append(task.training_docs)
if task.has_validation_docs(): fns.append(task.validation_docs)
if task.has_training_docs():
fns.append(task.training_docs)
if task.has_validation_docs():
fns.append(task.validation_docs)
# test doc might not have labels
#if task.has_test_docs(): fns.append(task.test_docs)
# if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns:
#print(list(islice(fn(), 10)))
# print(list(islice(fn(), 10)))
for doc in islice(fn(), 10):
txt = task.doc_to_text(doc)
......@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
reqs = task.construct_requests(doc, txt)
# construct_requests can return just one request
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess
for req in reqs:
......
......@@ -25,6 +25,7 @@ def assert_target(name, ob):
with open(fname, 'w') as fh:
json.dump(ob, fh, sort_keys=True)
def assert_target_hashed(name, ob):
fname = f"tests/testdata/{name}"
if os.path.exists(fname):
......@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):
# make sure eval results for a task version are stable
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
lm = models.get_model('dummy')()
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
res = []
random.seed(42)
......@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
for string, in reqs:
assert isinstance(string, str)
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)
res = []
random.seed(42)
......@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
def greedy_until(reqs):
res = []
assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
for ctx, _ in reqs:
res.append("lol")
......@@ -97,5 +99,5 @@ def test_versions_stable(taskname, Task):
lm.greedy_until = greedy_until
limit = None
res = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
assert_target(f"{taskname}-v{Task.VERSION}-res", res)
result = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
assert_target(f"{taskname}-v{task_class.VERSION}-res", result)