Commit e48b7884 authored by Leo Gao's avatar Leo Gao
Browse files

Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness into mhf

# Conflicts:
#	lm_eval/tasks/__init__.py
parents 491283c5 879aabd6
...@@ -305,6 +305,8 @@ class Request: ...@@ -305,6 +305,8 @@ class Request:
def __eq__(self, other): def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index return self.type == other.type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.type}{self.args}[{self.index}]\n"
class RequestFactory: class RequestFactory:
def __getattr__(self, attr): def __getattr__(self, attr):
......
...@@ -21,7 +21,8 @@ class DummyLM(LM): ...@@ -21,7 +21,8 @@ class DummyLM(LM):
def greedy_until(self, requests): def greedy_until(self, requests):
res = [] res = []
for _ in requests: for ctx, _ in requests:
res.append("lol") res.append("lol")
assert ctx.strip() != ''
return res return res
...@@ -16,6 +16,8 @@ class GPT2LM(LM): ...@@ -16,6 +16,8 @@ class GPT2LM(LM):
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained(pretrained) self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained(pretrained)
self.tokenizer.pad_token = "<|endoftext|>" self.tokenizer.pad_token = "<|endoftext|>"
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
@classmethod @classmethod
def create_from_arg_string(cls, arg_string): def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string) args = utils.simple_parse_args_string(arg_string)
......
...@@ -37,7 +37,7 @@ def oa_completion(**kwargs): ...@@ -37,7 +37,7 @@ def oa_completion(**kwargs):
class GPT3LM(LM): class GPT3LM(LM):
MAX_LENGTH = 2048 MAX_LENGTH = 2048
REQ_CHUNK_SIZE = 64 REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256 MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False): def __init__(self, engine, truncate=False):
...@@ -52,8 +52,10 @@ class GPT3LM(LM): ...@@ -52,8 +52,10 @@ class GPT3LM(LM):
self.engine = engine self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
# to make the annoying "Using pad_token, but it is not set yet." error go away # to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>" self.tokenizer.pad_token = "<|endoftext|>"
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
self.truncate = truncate self.truncate = truncate
# Read from environment variable OPENAI_API_SECRET_KEY # Read from environment variable OPENAI_API_SECRET_KEY
...@@ -99,24 +101,46 @@ class GPT3LM(LM): ...@@ -99,24 +101,46 @@ class GPT3LM(LM):
return res return res
def greedy_until(self, requests): def greedy_until(self, requests):
if not requests: return []
import openai import openai
res = [] res = []
for context, until in tqdm(requests): def sameuntil_chunks(xs, size):
context_enc = self.tokenizer.encode(context) ret = []
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):] lastuntil = xs[0][1]
ctxlen = len(context_enc) - max(0, len(context_enc) - (self.MAX_LENGTH - self.MAX_GEN_TOKS)) for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret: yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until`
for chunk, until in tqdm(list(sameuntil_chunks(requests, self.REQ_CHUNK_SIZE))):
inps = []
for context, _ in chunk:
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
inps.append(inp)
response = oa_completion( response = oa_completion(
engine=self.engine, engine=self.engine,
prompt=[inp], prompt=inps,
max_tokens=self.MAX_GEN_TOKS, max_tokens=self.MAX_GEN_TOKS,
temperature=0., temperature=0.,
logprobs=10, logprobs=10,
stop=until stop=until
) )
res.append(response.choices[0]['text']) for resp in response.choices:
s = resp['text']
for term in until:
s = s.split(term)[0]
res.append(s)
return res return res
...@@ -19,7 +19,7 @@ from . import naturalqs ...@@ -19,7 +19,7 @@ from . import naturalqs
from . import sat from . import sat
from . import arithmetic from . import arithmetic
from . import lambada from . import lambada
from . import race from . import race
from . import piqa from . import piqa
from . import triviaqa from . import triviaqa
from . import pubmedqa from . import pubmedqa
...@@ -30,6 +30,7 @@ from . import translation ...@@ -30,6 +30,7 @@ from . import translation
from . import headqa from . import headqa
from . import mathqa from . import mathqa
from . import ethics from . import ethics
from . import unscramble
######################################## ########################################
# Translation tasks # Translation tasks
...@@ -139,6 +140,13 @@ TASK_REGISTRY = { ...@@ -139,6 +140,13 @@ TASK_REGISTRY = {
**translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks), **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
# chef's selection, mostly wmt20 # chef's selection, mostly wmt20
**translation.create_tasks_from_benchmarks(selected_translation_benchmarks), **translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
# Word Scrambling and Manipulation Tasks
"anagrams1": unscramble.Anagrams1,
"anagrams2": unscramble.Anagrams2,
"cycle_letters": unscramble.CycleLetters,
"random_insertion": unscramble.RandomInsertion,
"reversed_words": unscramble.ReversedWords,
} }
......
...@@ -145,12 +145,6 @@ class GeneralTranslationTask(Task): ...@@ -145,12 +145,6 @@ class GeneralTranslationTask(Task):
tar_lang = code_to_language(language_codes[1]) tar_lang = code_to_language(language_codes[1])
return f"Translate these {src_lang} phrases to {tar_lang}." return f"Translate these {src_lang} phrases to {tar_lang}."
# TODO This should be something like
# French: {src_line}
# English: {ref_line}
def fewshot_context(self, doc, num_fewshot, provide_description):
return ""
def __str__(self): def __str__(self):
language_codes = self.sacrebleu_language_pair.split("-") language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0]) src_lang = code_to_language(language_codes[0])
......
import gzip
import json
import random
import shutil
from pathlib import Path
from best_download import download_file
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
def extract_gzip(gz, to):
    """Decompress the gzip archive at path `gz`, writing the raw bytes to `to`."""
    with gzip.open(gz, 'rb') as src, open(to, 'wb') as dest:
        shutil.copyfileobj(src, dest)
class WordUnscrambleTask(Task):
    """Base class for the GPT-3 word scrambling / manipulation tasks.

    Each subclass pins a `.jsonl` data file (and its SHA256 checksum) from the
    openai/gpt-3 repository. Every document has a scrambled "context" and the
    unscrambled "completion"; the model must greedily generate the completion
    and is scored with exact-match accuracy.
    """

    BASE_PATH = Path("data/unscramble")
    FILENAME = None   # data file name under BASE_PATH; set by each subclass
    CHECKSUM = None   # SHA256 Checksum.

    def __init__(self):
        super().__init__()

    def download(self):
        """Fetch and decompress the task's data file if not already present."""
        # parents/exist_ok: the original Path.mkdir(BASE_PATH) raised
        # FileNotFoundError when the parent "data/" directory did not exist.
        self.BASE_PATH.mkdir(parents=True, exist_ok=True)
        file = self.BASE_PATH / self.FILENAME
        if not file.exists():
            rawfile = file.parent / (file.name + ".gz")
            base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
            download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM)
            extract_gzip(gz=rawfile, to=file)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        file = self.BASE_PATH / self.FILENAME
        # Read eagerly inside a context manager: the original returned a lazy
        # generator over open(file).read(), leaking the file handle.
        with open(file, encoding="utf-8") as f:
            return [json.loads(line) for line in f.read().splitlines()]

    def fewshot_description(self):
        return "Please unscramble the letters into a word, and write that word:"

    def doc_to_text(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        return doc["completion"]

    def construct_requests(self, doc, ctx):
        # Generate greedily until a newline; the answer is a single word/line.
        completion = rf.greedy_until(ctx, ["\n"])
        return completion

    def process_results(self, doc, results):
        pred = results[0]
        gold = doc["completion"]
        # Exact string match against the gold completion.
        return {
            "acc": int(pred == gold)
        }

    def aggregation(self):
        return {
            "acc": mean
        }

    def higher_is_better(self):
        return {
            "acc": True
        }
# Concrete unscramble tasks. Each subclass only pins the GPT-3 data file it
# evaluates on and that file's SHA256 checksum; all behavior lives in
# WordUnscrambleTask.


class Anagrams1(WordUnscrambleTask):
    # Anagrams holding the first and last letter fixed ("mid word" variant 1).
    FILENAME = "mid_word_1_anagrams.jsonl"
    CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"


class Anagrams2(WordUnscrambleTask):
    # Anagrams holding the first two and last two letters fixed (variant 2).
    FILENAME = "mid_word_2_anagrams.jsonl"
    CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"


class CycleLetters(WordUnscrambleTask):
    # Letters of the word rotated (cycled); recover the original word.
    FILENAME = "cycle_letters_in_word.jsonl"
    CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"


class RandomInsertion(WordUnscrambleTask):
    # Random characters inserted between the word's letters; remove them.
    FILENAME = "random_insertion_in_word.jsonl"
    CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"


class ReversedWords(WordUnscrambleTask):
    # The word spelled backwards; reverse it.
    FILENAME = "reversed_words.jsonl"
    CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
...@@ -30,6 +30,9 @@ def main(): ...@@ -30,6 +30,9 @@ def main():
np.random.seed(args.seed) np.random.seed(args.seed)
lm = models.get_model(args.model).create_from_arg_string(args.model_args) lm = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
if not args.no_cache: if not args.no_cache:
lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db') lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
......
...@@ -8,4 +8,5 @@ transformers>=4.1 ...@@ -8,4 +8,5 @@ transformers>=4.1
sqlitedict==1.6.0 sqlitedict==1.6.0
pytablewriter==0.58.0 pytablewriter==0.58.0
sacrebleu==1.5.0 sacrebleu==1.5.0
pycountry==20.7.3 pycountry==20.7.3
\ No newline at end of file numexpr==2.7.2
\ No newline at end of file
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
logging.getLogger("openai").setLevel(logging.WARNING)
# Candidate few-shot description strings to compare; the script runs one full
# evaluation per entry. ("foo"/"bar" are placeholders — edit before running.)
fewshot_descriptions = [
    "foo",
    "bar"
]

# Hard-coded run configuration: this is a standalone experiment script, so
# these constants stand in for command-line arguments.
task = "lambada"    # task name, resolved via tasks.get_task_dict
num_fewshot = 0     # number of few-shot examples per prompt
model = "gpt2"      # model key for models.get_model
model_args = ""     # comma-separated key=value args for the model constructor
limit = None        # cap on number of docs; testing only (see warning in main)
no_cache = False    # when True, skip wrapping the model in CachingLM
class CustomDescTask:
    """Proxy that wraps a task and overrides its few-shot description.

    All other attribute access is delegated to the wrapped task via
    `__getattr__`. The override lives on the wrapper itself: the original
    implementation monkey-patched `fewshot_description` onto the wrapped
    (shared) task object, so the override leaked past the wrapper's lifetime
    and the most recent description silently won for every wrapper of the
    same task. Keeping the wrapped task unmodified fixes that.
    """

    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

    def fewshot_description(self):
        # Shadow the wrapped task's description with the custom one.
        return self.desc

    def __getattr__(self, attr):
        # Only reached for attributes not found on the wrapper; delegate.
        return getattr(self.task, attr)
def main():
    """Evaluate `model` on `task` once per candidate few-shot description.

    For each entry in `fewshot_descriptions`, wraps every task in a
    CustomDescTask carrying that description, runs the evaluator, and prints
    the raw JSON results followed by a markdown summary table.
    """
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        cache_db = 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db'
        lm = base.CachingLM(lm, cache_db)

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {
            name: CustomDescTask(t, desc) for name, t in task_dict.items()
        }
        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)

        print('Description:', desc)
        print(json.dumps(results, indent=2))

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        rows = []
        for task_name, metrics in results.items():
            # Blank the task name after its first metric row so the table
            # doesn't repeat it.
            label = task_name
            for metric_name, value in metrics.items():
                rows.append([label, metric_name, '%.4f' % value])
                label = ""
        writer.value_matrix = rows
        print(writer.dumps())


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment