Unverified Commit df5d7cf0 authored by Leo Gao, committed by GitHub

Merge pull request #229 from EleutherAI/lm_refactor

Refactor LM organization for more reuse
parents 67e2bf8b 9590f366
import abc
import random
from typing import Iterable
import numpy as np
import re
import os
import json
import hashlib
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC):
def __init__(self):
self.cache_hook = CacheHook(None)
@abc.abstractmethod
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
......@@ -34,7 +43,7 @@ class LM(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
......@@ -77,7 +86,7 @@ class LM(abc.ABC):
pass
# TODO: Add an optional max length
@abc.abstractmethod
@abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
......@@ -96,18 +105,235 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
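# Editor's illustrative aside (not part of the diff): create_from_arg_string expects a
# comma-separated "key=value" string, e.g. "device=cpu,pretrained=gpt2" (the real parsing
# lives in utils.simple_parse_args_string; the helper below is only a toy stand-in), while
# CLI-level options such as batch_size arrive via additional_config and are dropped when None.
def _toy_parse_args_string(arg_string):
    return dict(kv.split("=") for kv in arg_string.split(",") if kv)
print(_toy_parse_args_string("device=cpu,pretrained=gpt2"))  # {'device': 'cpu', 'pretrained': 'gpt2'}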
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str): pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]): pass
:param arg_string: str
Left up to individual model class to handle
"""
return cls()
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
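# Editor's illustrative aside (not part of the diff): the request format loglikelihood expects.
# Each request is a (context, continuation) pair of plain strings; an empty context is replaced
# by the end-of-text token above so the model still conditions on one token.
example_ll_requests = [
    ("The quick brown fox jumps over the lazy", " dog"),   # score " dog" given the context
    ("", " The"),                                           # empty context -> [eot_token_id]
]
# A concrete BaseLM subclass `lm` (hypothetical here) would return one (logprob, is_greedy)
# pair per request:
# results = lm.loglikelihood(example_ll_requests)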
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
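# Editor's illustrative aside (not part of the diff): a toy version of the rolling-window split,
# assuming the same idea as utils.get_rolling_token_windows + make_disjoint_window with
# context_len=1: every token is predicted exactly once, each window holds at most max_seq_len
# predicted tokens, and the context is a single token (the EOT prefix for the first window,
# otherwise the token just before the window). The real helpers may differ in edge cases.
def _toy_disjoint_windows(tokens, max_seq_len, prefix_token):
    windows = []
    for start in range(0, len(tokens), max_seq_len):
        context = [prefix_token] if start == 0 else [tokens[start - 1]]
        windows.append((context, tokens[start:start + max_seq_len]))
    return windows
print(_toy_disjoint_windows(list(range(7)), max_seq_len=3, prefix_token=-1))
# [([-1], [0, 1, 2]), ([2], [3, 4, 5]), ([5], [6])]
# The per-window log-likelihoods returned by _loglikelihood_tokens are then summed per string.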
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over- rather than under-estimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1],
dtype=torch.long
).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad length from seq to padding_length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0)  # [batch, padding_length]
multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks \
in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
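# Editor's illustrative aside (not part of the diff): what the continuation slicing and gather
# above compute, on a tiny made-up example (random logits, arbitrary token ids); only the
# indexing pattern matches the method. Requests are processed longest-first (see _collate),
# so the first item in a batch fixes padding_length and any OOM surfaces immediately.
import torch
import torch.nn.functional as F
vocab_size = 10
inplen, contlen = 6, 2                               # 4 context + 2 continuation tokens
logits = F.log_softmax(torch.randn(1, inplen, vocab_size), dim=-1)
cont_toks = torch.tensor([[3, 7]])                   # [1, contlen] continuation token ids
cont_logits = logits[:, inplen - contlen:inplen, :]  # keep only the continuation positions
is_greedy = (cont_logits.argmax(dim=-1) == cont_toks).all()
ll = torch.gather(cont_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum()
print(float(ll), bool(is_greedy))                    # total continuation logprob, exact greedy match?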
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str):
until = [until]
primary_until, = self.tok_encode(until[0])
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
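# Editor's illustrative aside (not part of the diff): the greedy_until request format.
# Each request is (context, until); `until` may be a single stop string or a list of them.
# until[0] must encode to exactly one token (it becomes the eos_token_id passed to
# _model_generate), and every entry of `until` is then applied as a string-level stop
# by the split above.
example_gen_requests = [
    ("The quick brown fox jumps over the lazy", ["."]),
    ("Question: 2 + 2 = ?\nAnswer:", ["\n"]),
]
# A concrete BaseLM subclass `lm` (hypothetical here) would return one string per request:
# generations = lm.greedy_until(example_gen_requests)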
class Task(abc.ABC):
......@@ -128,17 +354,17 @@ class Task(abc.ABC):
"""Downloads the task dataset if necessary"""
pass
@abc.abstractmethod
@abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
@abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
@abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
......@@ -170,15 +396,15 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k)
@abc.abstractmethod
@abstractmethod
def doc_to_text(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def doc_to_target(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -192,7 +418,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -205,7 +431,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
......@@ -214,7 +440,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def higher_is_better(self):
"""
:returns: {str: bool}
......@@ -238,7 +464,9 @@ class Task(abc.ABC):
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs())
self._fewshot_docs = list(
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
......@@ -253,7 +481,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
class MultipleChoiceTask(Task):
class MultipleChoiceTask(Task, abc.ABC):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
......@@ -328,10 +556,10 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
loglikelihood, = results
words = self.count_words(doc)
bytes = self.count_bytes(doc)
bytes_ = self.count_bytes(doc)
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes),
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (-loglikelihood, self.count_bytes(doc))
}
......@@ -342,25 +570,16 @@ class PerplexityTask(Task, abc.ABC):
"bits_per_byte": weighted_mean
}
def count_bytes(self, doc):
@classmethod
def count_bytes(cls, doc):
return len(doc.encode("utf-8"))
def count_words(self, doc):
@classmethod
def count_words(cls, doc):
""" Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc))
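# Editor's illustrative aside (not part of the diff): how the (loglikelihood, count) pairs
# emitted by process_results become the reported metrics, following weighted_mean and
# weighted_perplexity defined in lm_eval.metrics below. The numbers are made up.
import math
items = [(-120.0, 40), (-80.0, 30)]                  # (sum of logprobs in nats, word count) per doc
total_ll = sum(ll for ll, _ in items)                # -200.0
total_words = sum(n for _, n in items)               # 70
word_perplexity = math.exp(-total_ll / total_words)  # exp(200/70) ~= 17.4
print(word_perplexity)
# byte_perplexity is the same computation over byte counts; bits_per_byte is the plain
# weighted mean of the (-loglikelihood, byte count) pairs, as wired up in aggregation().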
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
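# Editor's illustrative aside (not part of the diff): hash_args produces a deterministic
# sha256 hex key for a request, so the same (attr, args) pair maps to the same SqliteDict
# entry across runs (the cache lookup itself lives in the truncated CachingLM hunk below
# and is not shown here).
_key = hash_args("loglikelihood", ("The quick brown fox jumps over the lazy", " dog"))
_key_again = hash_args("loglikelihood", ("The quick brown fox jumps over the lazy", " dog"))
assert len(_key) == 64 and _key == _key_again  # stable across calls and processes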
......@@ -383,9 +602,17 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db): os.makedirs(os.path.dirname(cache_db), exist_ok=True)
if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm
......@@ -409,13 +636,14 @@ class CachingLM:
res.append(None)
remaining_reqs.append(req)
# actually run the LM
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1
while res[resptr] is not None:
resptr += 1
res[resptr] = r
......@@ -431,32 +659,39 @@ class CachingLM:
return CacheHook(self)
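# Editor's illustrative aside (not part of the diff): how the evaluator wires caching up
# (see simple_evaluate below); the model and path here are hypothetical.
# base_lm = lm_eval.models.get_model("gpt2").create_from_arg_string("device=cpu")
# lm = CachingLM(base_lm, "lm_cache/gpt2_device-cpu.db")
# Calls such as lm.loglikelihood(reqs) are answered from the SqliteDict where possible and
# delegated to base_lm only for the requests that are not cached yet, as in the
# remaining_reqs handling above.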
REQUEST_RETURN_LENGTHS = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
class Request:
def __init__(self, type, args, index=None):
if type not in req_ret_lens.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(type))
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.type = type
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0
for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i)
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i)
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.type}{self.args}[{self.index}]\n"
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory:
def __getattr__(self, attr):
......
......@@ -7,7 +7,33 @@ import lm_eval.tasks
import lm_eval.base
import numpy as np
def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
:param model_args: str
String arguments for each model class, see LM.create_from_arg_string
:param task_names: list[str]
List of task names
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
......@@ -16,7 +42,9 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
})
if not no_cache:
lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
lm = lm_eval.base.CachingLM(
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict(task_names)
results = evaluate(lm, task_dict, False, num_fewshot, limit)
......@@ -37,11 +65,33 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
assert not provide_description # not implemented. todo: implement proper description-providing system
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
......@@ -49,23 +99,25 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
# we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad
# (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of requeste
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
#default to test doc, fall back to val doc if validation unavailable
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
elif task.has_validation_docs():
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
......@@ -84,25 +136,26 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id))
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. we could also implement some kind of autogrouping here; they should end up next to each other.
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
......@@ -129,7 +182,10 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
......@@ -140,6 +196,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
def make_table(result_dict):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter
md_writer = MarkdownTableWriter()
......@@ -152,11 +209,11 @@ def make_table(result_dict):
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
for m, v in dic.items():
if m.endswith("_stderr"): continue
if m.endswith("_stderr"):
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else:
values.append([k, version, m, '%.4f' % v, '', ''])
......@@ -168,4 +225,4 @@ def make_table(result_dict):
# todo: make latex table look good
# print(latex_writer.dumps())
return md_writer.dumps()
\ No newline at end of file
return md_writer.dumps()
import math
from collections import Iterable
from pprint import pprint
from collections.abc import Iterable
import numpy as np
import sacrebleu
......@@ -63,6 +62,7 @@ def acc_all(items):
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def acc_all_stderr(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
......@@ -98,6 +98,7 @@ def weighted_mean(items):
a, b = zip(*items)
return sum(a) / sum(b)
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
......@@ -179,12 +180,13 @@ def _sacreformat(refs, preds):
return refs, preds
## stderr stuff
# stderr stuff
class _bootstrap_internal:
def __init__(self, f, n):
self.f = f
self.n = n
def __call__(self, v):
i, xs = v
rnd = random.Random()
......@@ -208,7 +210,9 @@ def bootstrap_stderr(f, xs, iters):
chunk_size = min(1000, iters)
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(pool.imap(_bootstrap_internal(f, chunk_size), [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
for bootstrap in tqdm(pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
# sample w replacement
res.extend(bootstrap)
......
......@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
......
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
from lm_eval.base import BaseLM
class GPT2LM(LM):
MAX_GEN_TOKS = 256
class HFLM(BaseLM):
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
......@@ -19,227 +13,91 @@ class GPT2LM(LM):
assert isinstance(batch_size, int)
if device:
self.device = torch.device(device)
self._device = torch.device(device)
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained, revision=revision +("/" + subfolder if subfolder is not None else "")).to(self.device)
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
# pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
self.VOCAB_SIZE = self.tokenizer.vocab_size
self.EOT_TOKEN_ID = self.tokenizer.eos_token_id
print(self.EOT_TOKEN_ID)
self.vocab_size = self.tokenizer.vocab_size
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparantly
self.max_length = self.gpt2.config.max_position_embeddings
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
self.batch_size = batch_size_per_gpu# * gpus
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context, add_special_tokens=False)
continuation_enc = self.tokenizer.encode(continuation, add_special_tokens=False)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string, add_special_tokens=False),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
# this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
contlens = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
# cont_toks 4 5 6 7 8 9
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad to length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
try:
return self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gpt2.config.max_position_embeddings
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits retuned from the model
logits returned from the model
"""
return self.gpt2(inps)[0][:, :, :50257]
with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0], add_special_tokens=False)
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context, add_special_tokens=False)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tokenizer.encode(until[0], add_special_tokens=False)
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
# for backwards compatibility
GPT2LM = HFLM
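# Editor's illustrative aside (not part of the diff): minimal HFLM usage on CPU, mirroring the
# unit tests further below; running it downloads the "gpt2" weights from the Hugging Face hub.
# lm = HFLM(device="cpu", pretrained="gpt2", batch_size=1)
# (ll, is_greedy), = lm.loglikelihood([("The quick brown fox jumps over the lazy", " dog")])
# generation, = lm.greedy_until([("The quick brown fox jumps over the lazy", [".", "\n"])])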
import os
import numpy as np
import transformers
from lm_eval.base import LM
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
......@@ -24,8 +36,11 @@ def get_result(response, ctxlen):
def oa_completion(**kwargs):
import openai
""" Query OpenAI API for completion.
Retry with back-off until they respond
"""
import openai
backoff_time = 3
while True:
try:
......@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5
class GPT3LM(LM):
MAX_LENGTH = 2048
class GPT3LM(BaseLM):
REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
......@@ -50,10 +62,12 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
......@@ -64,53 +78,36 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
loglikelihoods = []
for string, in tqdm(requests):
encoded = self.tokenizer.encode_plus(string)["input_ids"]
rolling_token_windows = utils.get_rolling_token_windows(
token_list=encoded,
prefix_token=self.end_of_text_token_id,
max_seq_len=self.MAX_LENGTH,
context_len=1,
)
string_loglikelihoods = []
for input_tokens, pred_tokens in rolling_token_windows:
block_output = self.get_token_logprobs(
input_tokens=input_tokens,
pred_tokens=pred_tokens,
)
string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods)
return loglikelihoods
def _loglikelihood_tokens(self, requests):
import openai
@property
def eot_token_id(self):
return self.tokenizer.eos_token_id
@property
def max_length(self):
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
......@@ -118,16 +115,18 @@ class GPT3LM(LM):
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
# max_length+1 because the API takes up to 2049 tokens, including the first context token
inp = (context_enc + continuation_enc)[-(self.max_length+1):]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))
inps.append(inp)
ctxlens.append(ctxlen)
......@@ -151,35 +150,14 @@ class GPT3LM(LM):
return reord.get_original(res)
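# Editor's illustrative aside (not part of the diff): the ctxlen arithmetic above, with made-up
# lengths. The API scores at most max_length + 1 = 2049 tokens, so an over-long prompt is
# truncated from the left and ctxlen shrinks while the continuation is always kept whole.
max_length = 2048
len_ctx, len_cont = 2100, 10
inp_len = min(len_ctx + len_cont, max_length + 1)                 # 2049 tokens sent to the API
ctxlen = len_ctx - max(0, len_ctx + len_cont - (max_length + 1))  # 2039 context tokens survive
assert inp_len - ctxlen == len_cont                               # all 10 continuation tokens scored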
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests):
if not requests: return []
import openai
if not requests:
return []
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0])
return (len(toks), x[0])
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
......@@ -193,34 +171,43 @@ class GPT3LM(LM):
lastuntil = x[1]
ret.append(x)
if ret: yield ret, lastuntil
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until`
# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = []
for context, _ in chunk:
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.MAX_GEN_TOKS,
max_tokens=self.max_gen_toks,
temperature=0.,
logprobs=10,
stop=until
stop=until,
)
for resp, (context, until) in zip(response.choices, chunk):
for resp, (context, until_) in zip(response.choices, chunk):
s = resp['text']
for term in until:
for term in until_:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
self.cache_hook.add_partial("greedy_until", (context, until_), s)
res.append(s)
return reord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
import argparse
import json
import numpy as np
import random
import logging
from lm_eval import models, tasks, evaluator, base
from lm_eval import tasks, evaluator
logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
......@@ -22,11 +21,10 @@ def parse_args():
parser.add_argument('--no_cache', action="store_true")
return parser.parse_args()
def main():
def main():
args = parse_args()
assert not args.provide_description # not implemented
assert not args.provide_description # not implemented
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
......@@ -36,7 +34,16 @@ def main():
else:
task_names = args.tasks.split(",")
results = evaluator.simple_evaluate(args.model, args.model_args, task_names, args.num_fewshot, args.batch_size, args.device, args.no_cache, args.limit)
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
task_names=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
)
dumped = json.dumps(results, indent=2)
......@@ -46,8 +53,12 @@ def main():
with open(args.output_path, "w") as f:
f.write(dumped)
print(f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}")
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
)
print(evaluator.make_table(results))
if __name__ == "__main__":
main()
......@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
name="lm_eval",
version="0.0.1",
version="0.1.0",
author="Leo Gao",
author_email="lg@eleuther.ai",
description="A framework for evaluating autoregressive language models",
......@@ -20,7 +20,7 @@ setuptools.setup(
],
python_requires='>=3.6',
install_requires=[
"black==20.8b1",
"black",
"best_download>=0.0.6",
"datasets==1.15.1",
"click>=7.1",
......
......@@ -10,8 +10,8 @@ import pytest
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
os.system("rm test_cache.db")
......@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
......@@ -50,5 +51,5 @@ def test_evaluator(taskname, Task):
e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
# check taht caching is working
# check that caching is working
assert e1 == e2
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
import os
import json
......@@ -10,10 +7,11 @@ import mock
import pickle
import hashlib
os.environ['OPENAI_API_SECRET_KEY'] = ""
def completion(**kwargs):
def mock_completion(**kwargs):
# Mock completion function
# Loads from a cached+pickled response if it exists, otherwise it will actually try to ping the OpenAI API
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
......@@ -21,16 +19,15 @@ def completion(**kwargs):
with open(fname, 'rb') as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, 'wb') as fh:
pickle.dump(ret, fh)
return ret
os.makedirs("tests/testdata", exist_ok=True)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
(ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'),
......@@ -39,8 +36,8 @@ def test_gpt3():
('The quick brown fox jumps over the lazy', ', lazy fox'),
('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""),
("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""),
("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""),
......@@ -69,15 +66,18 @@ def test_gpt3():
print([x[0] for x in vals])
targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
targets = [
-34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
......@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
gpt3.MAX_LENGTH = 5
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.93490880000002
with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
import pytest
import unittest.mock as mock
import lm_eval.models as models
......@@ -38,22 +39,31 @@ def test_gpt2():
assert gen == ', lazy fox and they both fall to the ground'
targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
targets = [
-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
-341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
def test_gpt2_perplexity():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487])
tgt = sum([
-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
-3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
])
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt2 to have shorter context length to induce rolling windows
gpt2.max_length = 5
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813])
with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
mock_max_length.return_value = 5
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([
-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
-4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
])
assert perplexity == pytest.approx(tgt, rel=1e-3)
......@@ -4,13 +4,13 @@ import pytest
from itertools import islice
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, task_class):
print('Evaluating task', taskname)
#dl = Task.download
#Task.download = MagicMock()
task = Task()
#Task.download = dl
# dl = task_class.download
# task_class.download = MagicMock()
task = task_class()
# task_class.download = dl
assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False]
......@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
assert isinstance(task.higher_is_better(), dict)
assert task.aggregation().keys() == task.higher_is_better().keys()
for v in task.higher_is_better().values(): assert v in [True, False]
for v in task.higher_is_better().values():
assert v in [True, False]
assert isinstance(task.VERSION, int)
# test deterministic docs
# (don't test train because it's slow)
task2 = Task()
task2 = task_class()
limit = None
if taskname in ["triviaqa"]: limit = 10000
if taskname in ["triviaqa"]:
limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit))
......@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
assert reqs == reqs2
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, task_class):
print('Evaluating task', taskname)
task = Task()
task = task_class()
fns = []
if task.has_training_docs(): fns.append(task.training_docs)
if task.has_validation_docs(): fns.append(task.validation_docs)
if task.has_training_docs():
fns.append(task.training_docs)
if task.has_validation_docs():
fns.append(task.validation_docs)
# test doc might not have labels
#if task.has_test_docs(): fns.append(task.test_docs)
# if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns:
#print(list(islice(fn(), 10)))
# print(list(islice(fn(), 10)))
for doc in islice(fn(), 10):
txt = task.doc_to_text(doc)
......@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
reqs = task.construct_requests(doc, txt)
# construct_requests can return just one request
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess
for req in reqs:
......
......@@ -25,6 +25,7 @@ def assert_target(name, ob):
with open(fname, 'w') as fh:
json.dump(ob, fh, sort_keys=True)
def assert_target_hashed(name, ob):
fname = f"tests/testdata/{name}"
if os.path.exists(fname):
......@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):
# make sure eval results for a task version are stable
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
lm = models.get_model('dummy')()
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
res = []
random.seed(42)
......@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
for string, in reqs:
assert isinstance(string, str)
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)
res = []
random.seed(42)
......@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
def greedy_until(reqs):
res = []
assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
for ctx, _ in reqs:
res.append("lol")
......@@ -97,5 +99,5 @@ def test_versions_stable(taskname, Task):
lm.greedy_until = greedy_until
limit = None
res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
assert_target(f"{taskname}-v{Task.VERSION}-res", res)
result = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
assert_target(f"{taskname}-v{task_class.VERSION}-res", result)