Unverified Commit df5d7cf0 authored by Leo Gao's avatar Leo Gao Committed by GitHub
Browse files

Merge pull request #229 from EleutherAI/lm_refactor

Refactor LM organization for more reuse
parents 67e2bf8b 9590f366
import abc import abc
import random from typing import Iterable
import numpy as np import numpy as np
import re import re
import os
import json
import hashlib
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean from lm_eval.metrics import mean, weighted_perplexity, weighted_mean
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC): class LM(abc.ABC):
def __init__(self): def __init__(self):
self.cache_hook = CacheHook(None) self.cache_hook = CacheHook(None)
@abc.abstractmethod @abstractmethod
def loglikelihood(self, requests): def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context. """Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other Downstream tasks should attempt to use loglikelihood instead of other
...@@ -34,7 +43,7 @@ class LM(abc.ABC): ...@@ -34,7 +43,7 @@ class LM(abc.ABC):
""" """
pass pass
@abc.abstractmethod @abstractmethod
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation """Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model. - We will use the full max context length of the model.
...@@ -77,7 +86,7 @@ class LM(abc.ABC): ...@@ -77,7 +86,7 @@ class LM(abc.ABC):
pass pass
# TODO: Add an optional max length # TODO: Add an optional max length
@abc.abstractmethod @abstractmethod
def greedy_until(self, requests): def greedy_until(self, requests):
"""Generate greedily until a stopping sequence """Generate greedily until a stopping sequence
...@@ -96,18 +105,235 @@ class LM(abc.ABC): ...@@ -96,18 +105,235 @@ class LM(abc.ABC):
pass pass
@classmethod @classmethod
def create_from_arg_string(cls, arg_string): def create_from_arg_string(cls, arg_string, additional_config=None):
"""Constructor method, in case models need additional arguments additional_config = {} if additional_config is None else additional_config
e.g. OpenAI API engine, paths for loading, other params args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
    """Common scaffolding for token-level LMs; subclasses supply tokenizer/model hooks."""

    @property
    @abstractmethod
    def eot_token_id(self):
        """Token id used as the end-of-text marker (also stands in for an empty context)."""
        pass

    @property
    @abstractmethod
    def max_length(self):
        """Maximum context length, in tokens, that the model supports."""
        pass

    @property
    @abstractmethod
    def max_gen_toks(self):
        """Maximum number of new tokens to generate in greedy_until."""
        pass

    @property
    @abstractmethod
    def batch_size(self):
        """Number of requests scored per model forward call."""
        pass

    @property
    @abstractmethod
    def device(self):
        """Device onto which input tensors are moved before the model call."""
        pass

    @abstractmethod
    def tok_encode(self, string: str):
        """Encode a string into a list of token ids."""
        pass

    @abstractmethod
    def tok_decode(self, tokens: Iterable[int]):
        """Decode a sequence of token ids back into a string."""
        pass
:param arg_string: str @abstractmethod
Left up to individual model class to handle def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
""" """
return cls() inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
def set_cache_hook(self, cache_hook): returns: a torch tensor of shape [batch, sequence, vocab] with the
self.cache_hook = cache_hook logits returned from the model
"""
pass
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
    """Tokenize each (context, continuation) pair and score it.

    An empty context is replaced by the end-of-text token so the model
    always conditions on at least one token.
    """
    tokenized = []
    for ctx, cont in requests:
        # empty context -> condition on the EOT token instead
        ctx_tokens = self.tok_encode(ctx) if ctx != "" else [self.eot_token_id]
        cont_tokens = self.tok_encode(cont)
        tokenized.append(((ctx, cont), ctx_tokens, cont_tokens))
    return self._loglikelihood_tokens(tokenized)
def loglikelihood_rolling(self, requests):
    """Compute the full-string log-likelihood of each request via rolling windows.

    Each input string is split into disjoint windows spanning the model's
    full context length; per-window log-probs are summed into one score.
    """
    # TODO: Implement caching once we've confirmed the perplexity implementation
    # TODO: automatic batch size detection for vectorization
    results = []
    for (text,) in tqdm(requests):
        windows = utils.get_rolling_token_windows(
            token_list=self.tok_encode(text),
            prefix_token=self.eot_token_id,
            max_seq_len=self.max_length,
            context_len=1,
        )
        # prepend None as the cache key: rolling windows are not partially cached
        token_reqs = [(None,) + utils.make_disjoint_window(w) for w in windows]
        # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
        # that
        scored = self._loglikelihood_tokens(token_reqs, disable_tqdm=True)
        # keep only the log-prob (discard is_greedy) and sum over windows
        results.append(sum(logprob for logprob, _ in scored))
    return results
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
    """Score pre-tokenized requests.

    :param requests: iterable of (cache_key, context_tokens, continuation_tokens)
        triples; cache_key of None disables partial caching for that request.
    :param disable_tqdm: bool, suppress the progress bar (used by rolling windows).
    :return: list of (summed log-prob, is-exact-greedy-match) tuples, in the
        original request order.
    """
    # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
    res = []
    def _collate(x):
        # the negative sign on len(toks) sorts descending - this has a few advantages:
        # - time estimates will always be over not underestimates, which is more useful for planning
        # - to know the size of a batch when going through the list, you know the first one is always the batch
        #   padded context length. this is useful to simplify the batching logic and more importantly to make
        #   automatic adaptive batches much much easier to implement
        # - any OOMs will happen right away rather than near the end
        toks = x[1] + x[2]
        return -len(toks), tuple(toks)
    # TODO: automatic (variable) batch size detection for vectorization
    reord = utils.Reorderer(requests, _collate)
    for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
        inps = []
        cont_toks_list = []
        inplens = []
        padding_length = None
        # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
        # tensors, then we pack them together into a batch, call the model, and then pick it all apart
        # again because vectorizing is annoying
        for _, context_enc, continuation_enc in chunk:
            # sanity check
            assert len(context_enc) > 0
            assert len(continuation_enc) > 0
            assert len(continuation_enc) <= self.max_length
            # how this all works:
            #          CTX      CONT
            # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
            # gpt2    \               \
            # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
            # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
            # when too long to fit in context, truncate from the left
            inp = torch.tensor(
                (context_enc + continuation_enc)[-(self.max_length+1):][:-1],
                dtype=torch.long
            ).to(self.device)
            inplen, = inp.shape
            cont = continuation_enc
            # since in _collate we make sure length is descending, the longest is always the first one.
            padding_length = padding_length if padding_length is not None else inplen
            # pad length from seq to padding_length
            inp = torch.cat([
                inp,  # [seq]
                torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device)  # [padding_length - seq]
            ], dim=0)
            inps.append(inp.unsqueeze(0))  # [1, padding_length]
            cont_toks_list.append(cont)
            inplens.append(inplen)
        batched_inps = torch.cat(inps, dim=0)  # [batch, padding_length]
        multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu()  # [batch, padding_length, vocab]
        for (cache_key, _, _), logits, inp, inplen, cont_toks \
                in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
            # Slice to original seq length
            contlen = len(cont_toks)
            logits = logits[inplen-contlen:inplen].unsqueeze(0)  # [1, seq, vocab]
            # Check if per-token argmax is exactly equal to continuation
            greedy_tokens = logits.argmax(dim=-1)
            cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)  # [1, seq]
            max_equal = (greedy_tokens == cont_toks).all()
            # Obtain log-probs at the corresponding continuation token indices
            # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
            logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)  # [1, seq]
            # Answer: (log prob, is-exact-match)
            answer = (float(logits.sum()), bool(max_equal))
            # partial caching
            if cache_key is not None:
                self.cache_hook.add_partial("loglikelihood", cache_key, answer)
            res.append(answer)
    return reord.get_original(res)
def greedy_until(self, requests):
    """Greedy-decode each context until a stop sequence appears.

    :param requests: iterable of (context, until) pairs, where `until` is a
        stop string or a list of stop strings.
    :return: list of generated strings (stop sequence and everything after
        it stripped), in the original request order.
    """
    # TODO: implement fully general `until` that handles untils that are
    # multiple tokens or that span multiple tokens correctly
    # TODO: extract to TokenizedLM?
    res = []
    def _collate(x):
        # sort by tokenized context length so similarly-sized prompts are adjacent
        toks = self.tok_encode(x[0])
        return len(toks), x[0]
    reord = utils.Reorderer(requests, _collate)
    for context, until in tqdm(reord.get_reordered()):
        if isinstance(until, str):
            until = [until]
        # only the first stop sequence is passed to the model as EOS; the
        # tuple-unpacking asserts that it encodes to exactly one token
        primary_until, = self.tok_encode(until[0])
        # negative slice keeps the last (max_length - max_gen_toks) context
        # tokens, leaving room for max_gen_toks newly generated tokens
        context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
        cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
        # strip the prompt tokens, then cut at the first occurrence of any stop string
        s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
        for term in until:
            s = s.split(term)[0]
        # partial caching
        self.cache_hook.add_partial("greedy_until", (context, until), s)
        res.append(s)
    return reord.get_original(res)
class Task(abc.ABC): class Task(abc.ABC):
...@@ -128,17 +354,17 @@ class Task(abc.ABC): ...@@ -128,17 +354,17 @@ class Task(abc.ABC):
"""Downloads the task dataset if necessary""" """Downloads the task dataset if necessary"""
pass pass
@abc.abstractmethod @abstractmethod
def has_training_docs(self): def has_training_docs(self):
"""Whether the task has a training set""" """Whether the task has a training set"""
pass pass
@abc.abstractmethod @abstractmethod
def has_validation_docs(self): def has_validation_docs(self):
"""Whether the task has a validation set""" """Whether the task has a validation set"""
pass pass
@abc.abstractmethod @abstractmethod
def has_test_docs(self): def has_test_docs(self):
"""Whether the task has a test set""" """Whether the task has a test set"""
pass pass
...@@ -170,15 +396,15 @@ class Task(abc.ABC): ...@@ -170,15 +396,15 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k) return rnd.sample(self._training_docs, k)
@abc.abstractmethod @abstractmethod
def doc_to_text(self, doc): def doc_to_text(self, doc):
pass pass
@abc.abstractmethod @abstractmethod
def doc_to_target(self, doc): def doc_to_target(self, doc):
pass pass
@abc.abstractmethod @abstractmethod
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """ Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM. Requests which will be sent to the LM.
...@@ -192,7 +418,7 @@ class Task(abc.ABC): ...@@ -192,7 +418,7 @@ class Task(abc.ABC):
""" """
pass pass
@abc.abstractmethod @abstractmethod
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of dict where keys are the names of submetrics and values are the values of
...@@ -205,7 +431,7 @@ class Task(abc.ABC): ...@@ -205,7 +431,7 @@ class Task(abc.ABC):
""" """
pass pass
@abc.abstractmethod @abstractmethod
def aggregation(self): def aggregation(self):
""" """
:returns: {str: [metric_score] -> float} :returns: {str: [metric_score] -> float}
...@@ -214,7 +440,7 @@ class Task(abc.ABC): ...@@ -214,7 +440,7 @@ class Task(abc.ABC):
""" """
pass pass
@abc.abstractmethod @abstractmethod
def higher_is_better(self): def higher_is_better(self):
""" """
:returns: {str: bool} :returns: {str: bool}
...@@ -238,7 +464,9 @@ class Task(abc.ABC): ...@@ -238,7 +464,9 @@ class Task(abc.ABC):
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd) fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else: else:
if self._fewshot_docs is None: if self._fewshot_docs is None:
self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs()) self._fewshot_docs = list(
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1) fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
...@@ -253,7 +481,7 @@ class Task(abc.ABC): ...@@ -253,7 +481,7 @@ class Task(abc.ABC):
return description + labeled_examples + example return description + labeled_examples + example
class MultipleChoiceTask(Task): class MultipleChoiceTask(Task, abc.ABC):
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']] return " " + doc['choices'][doc['gold']]
...@@ -328,10 +556,10 @@ class PerplexityTask(Task, abc.ABC): ...@@ -328,10 +556,10 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results): def process_results(self, doc, results):
loglikelihood, = results loglikelihood, = results
words = self.count_words(doc) words = self.count_words(doc)
bytes = self.count_bytes(doc) bytes_ = self.count_bytes(doc)
return { return {
"word_perplexity": (loglikelihood, words), "word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes), "byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (-loglikelihood, self.count_bytes(doc)) "bits_per_byte": (-loglikelihood, self.count_bytes(doc))
} }
...@@ -342,25 +570,16 @@ class PerplexityTask(Task, abc.ABC): ...@@ -342,25 +570,16 @@ class PerplexityTask(Task, abc.ABC):
"bits_per_byte": weighted_mean "bits_per_byte": weighted_mean
} }
def count_bytes(self, doc): @classmethod
def count_bytes(cls, doc):
return len(doc.encode("utf-8")) return len(doc.encode("utf-8"))
def count_words(self, doc): @classmethod
def count_words(cls, doc):
""" Downstream tasks with custom word boundaries should override this! """ """ Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc)) return len(re.split(r"\s+", doc))
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(attr, args): def hash_args(attr, args):
dat = json.dumps([attr] + list(args)) dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest() return hashlib.sha256(dat.encode('utf-8')).hexdigest()
...@@ -383,9 +602,17 @@ class CacheHook: ...@@ -383,9 +602,17 @@ class CacheHook:
class CachingLM: class CachingLM:
def __init__(self, lm, cache_db): def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm self.lm = lm
self.cache_db = cache_db self.cache_db = cache_db
if os.path.dirname(cache_db): os.makedirs(os.path.dirname(cache_db), exist_ok=True) if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True) self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm # add hook to lm
...@@ -409,13 +636,14 @@ class CachingLM: ...@@ -409,13 +636,14 @@ class CachingLM:
res.append(None) res.append(None)
remaining_reqs.append(req) remaining_reqs.append(req)
# actually run the LM # actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs) rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones # stick the new ones back into the list and also cache any of the new ones
resptr = 0 resptr = 0
for req, r in zip(remaining_reqs, rem_res): for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1 while res[resptr] is not None:
resptr += 1
res[resptr] = r res[resptr] = r
...@@ -431,32 +659,39 @@ class CachingLM: ...@@ -431,32 +659,39 @@ class CachingLM:
return CacheHook(self) return CacheHook(self)
# Number of values each request type yields per call; None means the request
# returns a single, non-indexable result.
REQUEST_RETURN_LENGTHS = dict.fromkeys(
    ('loglikelihood', 'greedy_until', 'loglikelihood_rolling')
)
REQUEST_RETURN_LENGTHS['loglikelihood'] = 2
class Request: class Request:
def __init__(self, type, args, index=None): def __init__(self, request_type, args, index=None):
if type not in req_ret_lens.keys(): if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(type)) raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.type = type self.request_type = request_type
self.args = args self.args = args
self.index = index self.index = index
def __iter__(self): def __iter__(self):
if req_ret_lens[self.type] is None: if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!') raise IndexError('This request type does not return multiple arguments!')
i = 0 for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
for i in range(req_ret_lens[self.type]): yield Request(self.request_type, self.args, i)
yield Request(self.type, self.args, i)
def __getitem__(self, i): def __getitem__(self, i):
if req_ret_lens[self.type] is None: if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!') raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i) return Request(self.request_type, self.args, i)
def __eq__(self, other): def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self): def __repr__(self):
return f"Req_{self.type}{self.args}[{self.index}]\n" return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory: class RequestFactory:
def __getattr__(self, attr): def __getattr__(self, attr):
......
...@@ -7,7 +7,33 @@ import lm_eval.tasks ...@@ -7,7 +7,33 @@ import lm_eval.tasks
import lm_eval.base import lm_eval.base
import numpy as np import numpy as np
def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
:param model_args: str
String arguments for each model class, see LM.create_from_arg_string
:param task_names: list[str]
List of task names
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
random.seed(1234) random.seed(1234)
np.random.seed(1234) np.random.seed(1234)
...@@ -16,7 +42,9 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non ...@@ -16,7 +42,9 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
}) })
if not no_cache: if not no_cache:
lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db') lm = lm_eval.base.CachingLM(
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict(task_names) task_dict = lm_eval.tasks.get_task_dict(task_names)
results = evaluate(lm, task_dict, False, num_fewshot, limit) results = evaluate(lm, task_dict, False, num_fewshot, limit)
...@@ -37,11 +65,33 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non ...@@ -37,11 +65,33 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000): def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
assert not provide_description # not implemented. todo: implement proper description-providing system """Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())] # TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict) results = collections.defaultdict(dict)
versions = collections.defaultdict(dict) versions = collections.defaultdict(dict)
...@@ -49,23 +99,25 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -49,23 +99,25 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
requests = collections.defaultdict(list) requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list) requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory, # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have # over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {} docs = {}
# get lists of each type of requeste # get lists of each type of request
for task_name, task in task_dict_items: for task_name, task in task_dict_items:
versions[task_name] = task.VERSION versions[task_name] = task.VERSION
#default to test doc, fall back to val doc if validation unavailable # default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs(): if task.has_test_docs():
task_doc_func = task.test_docs task_doc_func = task.test_docs
elif task.has_validation_docs(): elif task.has_validation_docs():
task_doc_func = task.validation_docs task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func()) task_docs = list(task_doc_func())
...@@ -84,25 +136,26 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -84,25 +136,26 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
) )
reqs = task.construct_requests(doc, ctx) reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs] if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs): for i, req in enumerate(reqs):
requests[req.type].append(req) requests[req.request_type].append(req)
# i: index in requests for a single task instance # i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs` # doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id)) requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc) # all responses for each (task, doc)
process_res_queue = collections.defaultdict(list) process_res_queue = collections.defaultdict(list)
# execute each type of request # execute each type of request
for reqtype, reqs in requests.items(): for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a bandaid # only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of autogrouping here; they should end up next to each other. # solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests") print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs]) resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)] resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]): for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
...@@ -129,7 +182,10 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -129,7 +182,10 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this # so we run them less iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters) stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None: if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items) results[task_name][metric + "_stderr"] = stderr(items)
...@@ -140,6 +196,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -140,6 +196,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
def make_table(result_dict): def make_table(result_dict):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter from pytablewriter import MarkdownTableWriter, LatexTableWriter
md_writer = MarkdownTableWriter() md_writer = MarkdownTableWriter()
...@@ -152,11 +209,11 @@ def make_table(result_dict): ...@@ -152,11 +209,11 @@ def make_table(result_dict):
for k, dic in result_dict["results"].items(): for k, dic in result_dict["results"].items():
version = result_dict["versions"][k] version = result_dict["versions"][k]
for m, v in dic.items(): for m, v in dic.items():
if m.endswith("_stderr"): continue if m.endswith("_stderr"):
continue
if m + "_stderr" in dic: if m + "_stderr" in dic:
se = dic[m + "_stderr"] se = dic[m + "_stderr"]
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se]) values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else: else:
values.append([k, version, m, '%.4f' % v, '', '']) values.append([k, version, m, '%.4f' % v, '', ''])
...@@ -168,4 +225,4 @@ def make_table(result_dict): ...@@ -168,4 +225,4 @@ def make_table(result_dict):
# todo: make latex table look good # todo: make latex table look good
# print(latex_writer.dumps()) # print(latex_writer.dumps())
return md_writer.dumps() return md_writer.dumps()
\ No newline at end of file
import math import math
from collections import Iterable from collections.abc import Iterable
from pprint import pprint
import numpy as np import numpy as np
import sacrebleu import sacrebleu
...@@ -63,6 +62,7 @@ def acc_all(items): ...@@ -63,6 +62,7 @@ def acc_all(items):
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()]) acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc return acc
def acc_all_stderr(items): def acc_all_stderr(items):
# Only count as correct if all answers are labeled correctly for each question # Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {} question_scoring_dict = {}
...@@ -98,6 +98,7 @@ def weighted_mean(items): ...@@ -98,6 +98,7 @@ def weighted_mean(items):
a, b = zip(*items) a, b = zip(*items)
return sum(a) / sum(b) return sum(a) / sum(b)
def weighted_perplexity(items): def weighted_perplexity(items):
return math.exp(-weighted_mean(items)) return math.exp(-weighted_mean(items))
...@@ -179,12 +180,13 @@ def _sacreformat(refs, preds): ...@@ -179,12 +180,13 @@ def _sacreformat(refs, preds):
return refs, preds return refs, preds
## stderr stuff # stderr stuff
class _bootstrap_internal: class _bootstrap_internal:
def __init__(self, f, n): def __init__(self, f, n):
self.f = f self.f = f
self.n = n self.n = n
def __call__(self, v): def __call__(self, v):
i, xs = v i, xs = v
rnd = random.Random() rnd = random.Random()
...@@ -208,7 +210,9 @@ def bootstrap_stderr(f, xs, iters): ...@@ -208,7 +210,9 @@ def bootstrap_stderr(f, xs, iters):
chunk_size = min(1000, iters) chunk_size = min(1000, iters)
from tqdm import tqdm from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__) print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(pool.imap(_bootstrap_internal(f, chunk_size), [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size): for bootstrap in tqdm(pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
# sample w replacement # sample w replacement
res.extend(bootstrap) res.extend(bootstrap)
......
...@@ -3,6 +3,7 @@ from . import gpt3 ...@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy from . import dummy
MODEL_REGISTRY = { MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM, "gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM, "gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM, "dummy": dummy.DummyLM,
......
import transformers import transformers
import torch import torch
import torch.nn as nn from lm_eval.base import BaseLM
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
class GPT2LM(LM): class HFLM(BaseLM):
MAX_GEN_TOKS = 256
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1): def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__() super().__init__()
...@@ -19,227 +13,91 @@ class GPT2LM(LM): ...@@ -19,227 +13,91 @@ class GPT2LM(LM):
assert isinstance(batch_size, int) assert isinstance(batch_size, int)
if device: if device:
self.device = torch.device(device) self._device = torch.device(device)
else: else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF # TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained, revision=revision +("/" + subfolder if subfolder is not None else "")).to(self.device) self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval() self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2 # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder) self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(self.tokenizer, ( assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast, transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast, transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!" )), "this tokenizer has not been checked for compatibility yet!"
self.VOCAB_SIZE = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
self.EOT_TOKEN_ID = self.tokenizer.eos_token_id
print(self.EOT_TOKEN_ID)
try: if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
self.max_length = self.gpt2.config.n_ctx assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
except AttributeError: self.tokenizer.encode('hello\n\nhello')
# gptneoconfig doesn't have n_ctx apparantly
self.max_length = self.gpt2.config.max_position_embeddings
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
# multithreading and batching # multithreading and batching
gpus = torch.cuda.device_count() self.batch_size_per_gpu = batch_size # todo: adaptive batch size
batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
self.batch_size = batch_size_per_gpu# * gpus
# TODO: fix multi-gpu # TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1: # if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2) # self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod @property
def create_from_arg_string(cls, arg_string, additional_config={}): def eot_token_id(self):
args = utils.simple_parse_args_string(arg_string) # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
args2 = {k: v for k, v in additional_config.items() if v is not None} return self.tokenizer.eos_token_id
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context, add_special_tokens=False)
continuation_enc = self.tokenizer.encode(continuation, add_special_tokens=False)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string, add_special_tokens=False),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
# this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
contlens = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
# cont_toks 4 5 6 7 8 9
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc @property
def max_length(self):
# since in _collate we make sure length is descending, the longest is always the first one. try:
padding_length = padding_length if padding_length is not None else inplen return self.gpt2.config.n_ctx
except AttributeError:
# pad to length # gptneoconfig doesn't have n_ctx apparently
inp = torch.cat([ return self.gpt2.config.max_position_embeddings
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
#last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
answer = (float(logits.sum()), bool(max_equal)) @property
def max_gen_toks(self):
return 256
# partial caching @property
if cache_key is not None: def batch_size(self):
self.cache_hook.add_partial("loglikelihood", cache_key, answer) # TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
res.append(answer) @property
def device(self):
# TODO: fix multi-gpu
return self._device
return reord.get_original(res) def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps): def _model_call(self, inps):
""" """
inps: a torch tensor of shape [batch, sequence] inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the returns: a torch tensor of shape [batch, sequence, vocab] with the
logits retuned from the model logits returned from the model
""" """
return self.gpt2(inps)[0][:, :, :50257] with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests): def _model_generate(self, context, max_length, eos_token_id):
# TODO: implement fully general `until` that handles untils that are return self.gpt2.generate(
# multiple tokens or that span multiple tokens correctly context,
res = [] max_length=max_length,
eos_token_id=eos_token_id,
def _collate(x): do_sample=False
toks = self.tokenizer.encode(x[0], add_special_tokens=False) )
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate) # for backwards compatibility
GPT2LM = HFLM
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context, add_special_tokens=False)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tokenizer.encode(until[0], add_special_tokens=False)
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
import os import os
import numpy as np import numpy as np
import transformers import transformers
from lm_eval.base import LM from lm_eval.base import BaseLM
from lm_eval import utils from lm_eval import utils
from tqdm import tqdm from tqdm import tqdm
import time import time
def get_result(response, ctxlen): def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True is_greedy = True
logprobs = response["logprobs"]["token_logprobs"] logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:]) continuation_logprobs = sum(logprobs[ctxlen:])
...@@ -24,8 +36,11 @@ def get_result(response, ctxlen): ...@@ -24,8 +36,11 @@ def get_result(response, ctxlen):
def oa_completion(**kwargs): def oa_completion(**kwargs):
import openai """ Query OpenAI API for completion.
Retry with back-off until they respond
"""
import openai
backoff_time = 3 backoff_time = 3
while True: while True:
try: try:
...@@ -35,11 +50,8 @@ def oa_completion(**kwargs): ...@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5 backoff_time *= 1.5
class GPT3LM(LM): class GPT3LM(BaseLM):
MAX_LENGTH = 2048
REQ_CHUNK_SIZE = 20 REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False): def __init__(self, engine, truncate=False):
""" """
...@@ -50,10 +62,12 @@ class GPT3LM(LM): ...@@ -50,10 +62,12 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error) Truncate input if too long (if False and input is too long, throw error)
""" """
super().__init__() super().__init__()
import openai import openai
self.engine = engine self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away # to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>" self.tokenizer.pad_token = "<|endoftext|>"
...@@ -64,53 +78,36 @@ class GPT3LM(LM): ...@@ -64,53 +78,36 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY # Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"] openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod @property
def create_from_arg_string(cls, arg_string, additional_config={}): def eot_token_id(self):
args = utils.simple_parse_args_string(arg_string) return self.tokenizer.eos_token_id
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2) @property
def max_length(self):
def loglikelihood(self, requests): # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
new_reqs = [] return 2048
for context, continuation in requests:
if context == "": @property
# end of text as context def max_gen_toks(self):
context_enc = [50256] return 256
else:
context_enc = self.tokenizer.encode(context) @property
def batch_size(self):
continuation_enc = self.tokenizer.encode(continuation) # Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
new_reqs.append(((context, continuation), context_enc, continuation_enc))
@property
return self._loglikelihood_tokens(new_reqs) def device(self):
# Isn't used because we override _loglikelihood_tokens
def loglikelihood_rolling(self, requests): raise NotImplementedError()
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
def tok_encode(self, string: str):
loglikelihoods = [] return self.tokenizer.encode(string, add_special_tokens=False)
for string, in tqdm(requests):
encoded = self.tokenizer.encode_plus(string)["input_ids"] def tok_decode(self, tokens):
rolling_token_windows = utils.get_rolling_token_windows( return self.tokenizer.decode(tokens)
token_list=encoded,
prefix_token=self.end_of_text_token_id, def _loglikelihood_tokens(self, requests, disable_tqdm=False):
max_seq_len=self.MAX_LENGTH,
context_len=1,
)
string_loglikelihoods = []
for input_tokens, pred_tokens in rolling_token_windows:
block_output = self.get_token_logprobs(
input_tokens=input_tokens,
pred_tokens=pred_tokens,
)
string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods)
return loglikelihoods
def _loglikelihood_tokens(self, requests):
import openai
res = [] res = []
def _collate(x): def _collate(x):
...@@ -118,16 +115,18 @@ class GPT3LM(LM): ...@@ -118,16 +115,18 @@ class GPT3LM(LM):
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't # we care about and so we need some kind of backup for when it isn't
toks = x[1] + x[2] toks = x[1] + x[2]
return (-len(toks), tuple(toks)) return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate) reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))): for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = [] inps = []
ctxlens = [] ctxlens = []
for cache_key, context_enc, continuation_enc in chunk: for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:] # max_length+1 because the API takes up to 2049 tokens, including the first context token
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH) inp = (context_enc + continuation_enc)[-(self.max_length+1):]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))
inps.append(inp) inps.append(inp)
ctxlens.append(ctxlen) ctxlens.append(ctxlen)
...@@ -151,35 +150,14 @@ class GPT3LM(LM): ...@@ -151,35 +150,14 @@ class GPT3LM(LM):
return reord.get_original(res) return reord.get_original(res)
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests): def greedy_until(self, requests):
if not requests: return [] if not requests:
import openai return []
res = [] res = []
def _collate(x): def _collate(x):
toks = self.tokenizer.encode(x[0]) toks = self.tok_encode(x[0])
return (len(toks), x[0]) return len(toks), x[0]
reord = utils.Reorderer(requests, _collate) reord = utils.Reorderer(requests, _collate)
...@@ -193,34 +171,43 @@ class GPT3LM(LM): ...@@ -193,34 +171,43 @@ class GPT3LM(LM):
lastuntil = x[1] lastuntil = x[1]
ret.append(x) ret.append(x)
if ret: yield ret, lastuntil if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until` # todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))): for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = [] inps = []
for context, _ in chunk: for context, _ in chunk:
context_enc = self.tokenizer.encode(context) context_enc = self.tok_encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):] inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp) inps.append(inp)
response = oa_completion( response = oa_completion(
engine=self.engine, engine=self.engine,
prompt=inps, prompt=inps,
max_tokens=self.MAX_GEN_TOKS, max_tokens=self.max_gen_toks,
temperature=0., temperature=0.,
logprobs=10, logprobs=10,
stop=until stop=until,
) )
for resp, (context, until) in zip(response.choices, chunk): for resp, (context, until_) in zip(response.choices, chunk):
s = resp['text'] s = resp['text']
for term in until: for term in until_:
s = s.split(term)[0] s = s.split(term)[0]
# partial caching # partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s) self.cache_hook.add_partial("greedy_until", (context, until_), s)
res.append(s) res.append(s)
return reord.get_original(res) return reord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
import argparse import argparse
import json import json
import numpy as np
import random
import logging import logging
from lm_eval import models, tasks, evaluator, base from lm_eval import tasks, evaluator
logging.getLogger("openai").setLevel(logging.WARNING) logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True) parser.add_argument('--model', required=True)
...@@ -22,11 +21,10 @@ def parse_args(): ...@@ -22,11 +21,10 @@ def parse_args():
parser.add_argument('--no_cache', action="store_true") parser.add_argument('--no_cache', action="store_true")
return parser.parse_args() return parser.parse_args()
def main():
def main():
args = parse_args() args = parse_args()
assert not args.provide_description # not implemented
assert not args.provide_description # not implemented
if args.limit: if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.") print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
...@@ -36,7 +34,16 @@ def main(): ...@@ -36,7 +34,16 @@ def main():
else: else:
task_names = args.tasks.split(",") task_names = args.tasks.split(",")
results = evaluator.simple_evaluate(args.model, args.model_args, task_names, args.num_fewshot, args.batch_size, args.device, args.no_cache, args.limit) results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
task_names=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
)
dumped = json.dumps(results, indent=2) dumped = json.dumps(results, indent=2)
...@@ -46,8 +53,12 @@ def main(): ...@@ -46,8 +53,12 @@ def main():
with open(args.output_path, "w") as f: with open(args.output_path, "w") as f:
f.write(dumped) f.write(dumped)
print(f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}") print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
)
print(evaluator.make_table(results)) print(evaluator.make_table(results))
if __name__ == "__main__": if __name__ == "__main__":
main() main()
...@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh: ...@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup( setuptools.setup(
name="lm_eval", name="lm_eval",
version="0.0.1", version="0.1.0",
author="Leo Gao", author="Leo Gao",
author_email="lg@eleuther.ai", author_email="lg@eleuther.ai",
description="A framework for evaluating autoregressive language models", description="A framework for evaluating autoregressive language models",
...@@ -20,7 +20,7 @@ setuptools.setup( ...@@ -20,7 +20,7 @@ setuptools.setup(
], ],
python_requires='>=3.6', python_requires='>=3.6',
install_requires=[ install_requires=[
"black==20.8b1", "black",
"best_download>=0.0.6", "best_download>=0.0.6",
"datasets==1.15.1", "datasets==1.15.1",
"click>=7.1", "click>=7.1",
......
...@@ -10,8 +10,8 @@ import pytest ...@@ -10,8 +10,8 @@ import pytest
# TODO: more fine grained unit tests rather than this big honking integration # TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces # test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items()) @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task): def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname]) task_dict = tasks.get_task_dict([taskname])
os.system("rm test_cache.db") os.system("rm test_cache.db")
...@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task): ...@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
def ll_fn(reqs): def ll_fn(reqs):
for ctx, cont in reqs: for ctx, cont in reqs:
if len(ctx) == 0: continue if len(ctx) == 0:
continue
# space convention # space convention
assert ctx[-1] != ' ' assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n' assert cont[0] == ' ' or ctx[-1] == '\n'
...@@ -50,5 +51,5 @@ def test_evaluator(taskname, Task): ...@@ -50,5 +51,5 @@ def test_evaluator(taskname, Task):
e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10) e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10) e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
# check taht caching is working # check that caching is working
assert e1 == e2 assert e1 == e2
import lm_eval.tasks as tasks
import lm_eval.models as models import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest import pytest
import os import os
import json import json
...@@ -10,10 +7,11 @@ import mock ...@@ -10,10 +7,11 @@ import mock
import pickle import pickle
import hashlib import hashlib
os.environ['OPENAI_API_SECRET_KEY'] = ""
def mock_completion(**kwargs):
def completion(**kwargs): # Mock completion function
# Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest() hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl" fname = f"tests/testdata/gpt3_test_{hash}.pkl"
...@@ -21,16 +19,15 @@ def completion(**kwargs): ...@@ -21,16 +19,15 @@ def completion(**kwargs):
with open(fname, 'rb') as fh: with open(fname, 'rb') as fh:
return pickle.load(fh) return pickle.load(fh)
ret = openai.Completion.create(**kwargs) ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, 'wb') as fh: with open(fname, 'wb') as fh:
pickle.dump(ret, fh) pickle.dump(ret, fh)
return ret return ret
os.makedirs("tests/testdata", exist_ok=True) @mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
def test_gpt3(): def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
(ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([ (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'), ('The quick brown fox jumps over the lazy', ' dog'),
...@@ -39,8 +36,8 @@ def test_gpt3(): ...@@ -39,8 +36,8 @@ def test_gpt3():
('The quick brown fox jumps over the lazy', ', lazy fox'), ('The quick brown fox jumps over the lazy', ', lazy fox'),
('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'), ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""), ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""),
("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""),
("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""),
...@@ -69,15 +66,18 @@ def test_gpt3(): ...@@ -69,15 +66,18 @@ def test_gpt3():
print([x[0] for x in vals]) print([x[0] for x in vals])
targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964] targets = [
-34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
]
for (pred, _), tgt in zip(vals, targets): for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3) assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
def test_gpt3_perplexity(): def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss." test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0] perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
...@@ -85,7 +85,9 @@ def test_gpt3_perplexity(): ...@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
assert perplexity == pytest.approx(tgt, rel=1e-3) assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows # Hack: modify gpt3 to have shorter context length to induce rolling windows
gpt3.MAX_LENGTH = 5 with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0] mock_max_length.return_value = 5
tgt = -101.93490880000002 gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3) assert perplexity == pytest.approx(tgt, rel=1e-3)
import pytest import pytest
import unittest.mock as mock
import lm_eval.models as models import lm_eval.models as models
...@@ -38,22 +39,31 @@ def test_gpt2(): ...@@ -38,22 +39,31 @@ def test_gpt2():
assert gen == ', lazy fox and they both fall to the ground' assert gen == ', lazy fox and they both fall to the ground'
targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281] targets = [
-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
-341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
]
for (pred, _), tgt in zip(vals, targets): for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3) assert pred == pytest.approx(tgt, rel=1e-3)
def test_gpt2_perplexity(): def test_gpt2_perplexity():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu") gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss." test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0] perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487]) tgt = sum([
-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
-3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
])
assert perplexity == pytest.approx(tgt, rel=1e-3) assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt2 to have shorter context length to induce rolling windows with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
gpt2.max_length = 5 mock_max_length.return_value = 5
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0] gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813]) perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([
-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
-4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
])
assert perplexity == pytest.approx(tgt, rel=1e-3) assert perplexity == pytest.approx(tgt, rel=1e-3)
...@@ -4,13 +4,13 @@ import pytest ...@@ -4,13 +4,13 @@ import pytest
from itertools import islice from itertools import islice
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items()) @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task): def test_basic_interface(taskname, task_class):
print('Evaluating task', taskname) print('Evaluating task', taskname)
#dl = Task.download # dl = task_class.download
#Task.download = MagicMock() # task_class.download = MagicMock()
task = Task() task = task_class()
#Task.download = dl # task_class.download = dl
assert task.has_training_docs() in [True, False] assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False] assert task.has_validation_docs() in [True, False]
...@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task): ...@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
assert isinstance(task.higher_is_better(), dict) assert isinstance(task.higher_is_better(), dict)
assert task.aggregation().keys() == task.higher_is_better().keys() assert task.aggregation().keys() == task.higher_is_better().keys()
for v in task.higher_is_better().values(): assert v in [True, False] for v in task.higher_is_better().values():
assert v in [True, False]
assert isinstance(task.VERSION, int) assert isinstance(task.VERSION, int)
# test deterministic docs # test deterministic docs
# (don't test train because it's slow) # (don't test train because it's slow)
task2 = Task() task2 = task_class()
limit = None limit = None
if taskname in ["triviaqa"]: limit = 10000 if taskname in ["triviaqa"]:
limit = 10000
if task.has_validation_docs(): if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit)) arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit)) arr2 = list(islice(task2.validation_docs(), limit))
...@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task): ...@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
assert reqs == reqs2 assert reqs == reqs2
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items()) @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, Task): def test_documents_and_requests(taskname, task_class):
print('Evaluating task', taskname) print('Evaluating task', taskname)
task = Task() task = task_class()
fns = [] fns = []
if task.has_training_docs(): fns.append(task.training_docs) if task.has_training_docs():
if task.has_validation_docs(): fns.append(task.validation_docs) fns.append(task.training_docs)
if task.has_validation_docs():
fns.append(task.validation_docs)
# test doc might not have labels # test doc might not have labels
#if task.has_test_docs(): fns.append(task.test_docs) # if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns: for fn in fns:
#print(list(islice(fn(), 10))) # print(list(islice(fn(), 10)))
for doc in islice(fn(), 10): for doc in islice(fn(), 10):
txt = task.doc_to_text(doc) txt = task.doc_to_text(doc)
...@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task): ...@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
reqs = task.construct_requests(doc, txt) reqs = task.construct_requests(doc, txt)
# construct_requests can return just one request # construct_requests can return just one request
if not isinstance(reqs, (list, tuple)): reqs = [reqs] if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess # todo: mock lm after refactoring evaluator.py to not be a mess
for req in reqs: for req in reqs:
......
...@@ -25,6 +25,7 @@ def assert_target(name, ob): ...@@ -25,6 +25,7 @@ def assert_target(name, ob):
with open(fname, 'w') as fh: with open(fname, 'w') as fh:
json.dump(ob, fh, sort_keys=True) json.dump(ob, fh, sort_keys=True)
def assert_target_hashed(name, ob): def assert_target_hashed(name, ob):
fname = f"tests/testdata/{name}" fname = f"tests/testdata/{name}"
if os.path.exists(fname): if os.path.exists(fname):
...@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'): ...@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):
# make sure eval results for a task version are stable # make sure eval results for a task version are stable
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items()) @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, Task): def test_versions_stable(taskname, task_class):
task_dict = tasks.get_task_dict([taskname]) task_dict = tasks.get_task_dict([taskname])
lm = models.get_model('dummy')() lm = models.get_model('dummy')()
def ll_fn(reqs): def ll_fn(reqs):
for ctx, cont in reqs: for ctx, cont in reqs:
if len(ctx) == 0: continue if len(ctx) == 0:
continue
# space convention # space convention
assert ctx[-1] != ' ' assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n' assert cont[0] == ' ' or ctx[-1] == '\n'
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs) assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
res = [] res = []
random.seed(42) random.seed(42)
...@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task): ...@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
for string, in reqs: for string, in reqs:
assert isinstance(string, str) assert isinstance(string, str)
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs) assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)
res = [] res = []
random.seed(42) random.seed(42)
...@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task): ...@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
def greedy_until(reqs): def greedy_until(reqs):
res = [] res = []
assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs) assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
for ctx, _ in reqs: for ctx, _ in reqs:
res.append("lol") res.append("lol")
...@@ -97,5 +99,5 @@ def test_versions_stable(taskname, Task): ...@@ -97,5 +99,5 @@ def test_versions_stable(taskname, Task):
lm.greedy_until = greedy_until lm.greedy_until = greedy_until
limit = None limit = None
res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10) result = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
assert_target(f"{taskname}-v{Task.VERSION}-res", res) assert_target(f"{taskname}-v{task_class.VERSION}-res", result)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment