Commit 4b133dca authored by Leo Gao

Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness into cfsquad

# Conflicts:
#	lm_eval/tasks/squad.py
parents 8de85534 caba51e1
 import numpy as np
-import json
-import random
 from .common import HFTask
 from lm_eval.base import rf
 from ..metrics import mean
@@ -40,12 +38,6 @@ class Pubmed_QA(HFTask):
     def doc_to_target(self, doc):
         return " {}".format(doc["final_decision"])
 
-    def fewshot_examples(self, k):
-        # Since only test docs sample from test docs
-        if self._training_docs is None:
-            self._training_docs = list(self.test_docs())
-        return random.sample(self._training_docs, k)
-
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns
         an iterable of Requests which will be sent to the LM.
......
 import os
+import numpy as np
+from best_download import download_file
+from lm_eval.base import MultipleChoiceTask, rf
+from lm_eval.metrics import mean
 import xml.etree.ElementTree as ET
-import random
-from best_download import download_file
-from lm_eval.base import MultipleChoiceTask
 class QA4MRE(MultipleChoiceTask):
     YEAR = None

@@ -46,12 +44,6 @@ class QA4MRE(MultipleChoiceTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_examples(self, k):
-        # Since only test docs sample from test docs
-        if self._training_docs is None:
-            self._training_docs = list(self.test_docs())
-        return random.sample(self._training_docs, k)
-
     def _convert_standard(self, question):
         choices = [i.text for i in question.iter('answer')]
         out_doc = {
......
 import json
-import random
 import os
 from lm_eval.base import Task
 from ..utils import sh
......
@@ -5,11 +5,6 @@ from lm_eval.base import rf
 from ..metrics import mean
 from . common import HFTask
-import os
-from functools import reduce
-import operator
-from tqdm import tqdm
-import json
 
 
 class each:
     def __init__(self, f):
......
-import json
-import random
 import os
-from lm_eval.base import MultipleChoiceTask, rf
-from ..metrics import mean
-from tqdm import auto as tqdm_lib
-from . common import simple_accuracy_metric
-import numpy as np
-from ..utils import sh
+from lm_eval.base import MultipleChoiceTask
 
 
 class SATAnalogies(MultipleChoiceTask):
......
 import os
 import json
-from ..utils import sh
-from lm_eval.base import MultipleChoiceTask, rf
-from ..metrics import mean
 import zipfile
+from lm_eval.base import MultipleChoiceTask
 from best_download import download_file
......
@@ -17,7 +17,6 @@ def _squad_agg(key, items):
     return _squad_metric(predictions=predictions, references=references)[key]
 
 
 class SQuAD2(HFTask):
     DATASET_PATH = "squad_v2"
     DATASET_NAME = None
......
-import json
-import random
-from lm_eval.base import Task
-from ..utils import sh
 import csv
+from lm_eval.base import Task
 
 
 class StoryCloze(Task):
     NEEDS_MANUAL_DL = True
......
@@ -4,11 +4,11 @@ To-do:
     - ReCoRD
 """
 import numpy as np
-import sklearn
-import transformers.data.metrics.squad_metrics as squad_metrics
 from . common import HFTask, yesno
 from lm_eval.base import rf
 from ..metrics import mean, acc_all, metric_max_over_ground_truths
+import sklearn
+import transformers.data.metrics.squad_metrics as squad_metrics
 from ..utils import general_detokenize
@@ -23,7 +23,7 @@ class BoolQ(HFTask):
         return True
 
     def has_test_docs(self):
-        return True
+        return False
 
     def fewshot_description(self):
         # TODO: figure out actual description
@@ -74,7 +74,7 @@ class CommitmentBank(HFTask):
         return True
 
     def has_test_docs(self):
-        return True
+        return False
 
     def fewshot_description(self):
         # TODO: figure out actual description
@@ -145,7 +145,7 @@ class Copa(HFTask):
         return True
 
     def has_test_docs(self):
-        return True
+        return False
 
     def fewshot_description(self):
         # TODO: figure out actual description
@@ -209,7 +209,7 @@ class MultiRC(HFTask):
         return True
 
     def has_test_docs(self):
-        return True
+        return False
 
     def fewshot_description(self):
         # TODO: figure out actual description
@@ -355,7 +355,7 @@ class WordsInContext(HFTask):
         return True
 
     def has_test_docs(self):
-        return True
+        return False
 
     def fewshot_description(self):
         # TODO: figure out actual description
@@ -412,7 +412,7 @@ class SGWinogradSchemaChallenge(HFTask):
         return True
 
     def has_test_docs(self):
-        return True
+        return False
 
     def training_docs(self):
         if self.has_training_docs():
......
-import abc
-import json
-import random
-import os
-from collections import Iterable
-from pprint import pprint
 import pycountry
+from pprint import pprint
 from sacrebleu import sacrebleu
-import logging
 from lm_eval import metrics
 from lm_eval.base import Task, rf
@@ -86,11 +78,14 @@ class GeneralTranslationTask(Task):
         } for src, ref in zip(self.src_data, self.ref_data)]
 
     def doc_to_text(self, doc):
-        return doc["src"]
+        language_codes = self.sacrebleu_language_pair.split("-")
+        src_lang = code_to_language(language_codes[0])
+        tar_lang = code_to_language(language_codes[1])
+        return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"
 
     def doc_to_target(self, doc):
         # This shows a single target, though there may be multiple targets in a lang test
-        return doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
+        return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
......
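Note on the new prompt format above: a minimal, self-contained sketch (not part of the diff) of what `doc_to_text` and `doc_to_target` now produce, assuming `code_to_language` resolves ISO 639-1 codes to language names (e.g. via pycountry) and using a made-up en-fr document:

```python
# Standalone sketch mirroring the new translation prompt shown in the diff above.
# `code_to_language` is a stand-in here; the real helper lives in the task module.
import pycountry


def code_to_language(code):
    # "en" -> "English", "fr" -> "French"
    return pycountry.languages.get(alpha_2=code).name


def doc_to_text(doc, sacrebleu_language_pair="en-fr"):
    language_codes = sacrebleu_language_pair.split("-")
    src_lang = code_to_language(language_codes[0])
    tar_lang = code_to_language(language_codes[1])
    return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"


def doc_to_target(doc):
    # Leading space so the completion follows "... phrase:" naturally.
    return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]


doc = {"src": "The cat sat on the mat.", "ref": "Le chat est assis sur le tapis."}
print(doc_to_text(doc))    # English phrase: The cat sat on the mat.
                           # French phrase:
print(doc_to_target(doc))  # " Le chat est assis sur le tapis."
```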
 import os
 import json
-import random
 from lm_eval.base import Task, rf
 from ..metrics import mean
 from ..utils import sh
 
 
 class TriviaQA(Task):
     def download(self):
         if not os.path.exists('data/triviaqa'):
......
 import gzip
 import json
-import random
 import shutil
 from pathlib import Path
 from best_download import download_file
......
-import numpy as np
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import f1_score, matthews_corrcoef
-from tqdm import auto as tqdm_lib
-from . common import NLP_TASK, simple_accuracy_metric, yesno
+from . common import HFTask
 
 
-class WikiText103(NLP_TASK):
+class WikiText103(HFTask):
     NLP_PATH = "wikitext"
     NLP_NAME = "wikitext-103-raw-v1"
@@ -66,7 +63,7 @@ class WikiText103(NLP_TASK):
         raise NotImplementedError('Evaluation not implemented')
 
 
-class WikiText2(NLP_TASK):
+class WikiText2(HFTask):
     NLP_PATH = "wikitext"
     NLP_NAME = "wikitext-2-raw-v1"
......
@@ -56,10 +56,14 @@ class WinogradSchemaChallenge273(HFTask):
         # TODO: redo description
         return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
 
-    def fewshot_examples(self, k):
+    def fewshot_examples(self, k, rnd):
         # NOTE: `super().fewshot_examples` samples from training docs which are
         # not available for this test-set-only dataset.
-        return random.sample(list(self.test_docs()), k)
+
+        if self._fewshot_docs is None:
+            self._fewshot_docs = list(self.test_docs())
+
+        return rnd.sample(list(self._fewshot_docs), k)
 
     def doc_to_text(self, doc):
         return self.partial_context(doc, doc["options"][doc["label"]])
......
 import os
 import re
+import collections
 
 
 class ExitCodeError(Exception):
@@ -42,6 +43,14 @@ def chunks(iter, n):
     if arr: yield arr
 
 
+def group(arr, fn):
+    res = collections.defaultdict(list)
+
+    for ob in arr:
+        res[fn(ob)].append(ob)
+
+    return list(res.values())
+
+
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
@@ -50,3 +59,33 @@ def general_detokenize(string):
     string = string.replace(" \"", "\"")
     string = re.sub(r" (['.,])", r"\1", string)
     return string
+
+
+class Reorderer:
+    def __init__(self, arr, fn):
+        self.size = len(arr)
+        arr = list(enumerate(arr))
+        arr = group(arr, lambda x: fn(x[1]))
+        arr = [
+            ([y[0] for y in x], x[0][1]) for x in arr
+        ]
+        arr.sort(key=lambda x: fn(x[1]))
+
+        self.arr = arr
+
+    def get_reordered(self):
+        return [x[1] for x in self.arr]
+
+    def get_original(self, newarr):
+        res = [None] * self.size
+        cov = [False] * self.size
+
+        for (inds, _), v in zip(self.arr, newarr):
+            for ind in inds:
+                res[ind] = v
+                cov[ind] = True
+
+        assert all(cov)
+
+        return res
\ No newline at end of file
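For readers of the new `group` and `Reorderer` helpers above, here is a short usage sketch (not part of the commit): `Reorderer` buckets items that share a key, hands back one representative per bucket in sorted order, and `get_original` fans the computed results back out to the caller's original order. The string example below is made up.

```python
import collections


# Local copies of the helpers added in utils.py above, so this sketch is self-contained.
def group(arr, fn):
    res = collections.defaultdict(list)
    for ob in arr:
        res[fn(ob)].append(ob)
    return list(res.values())


class Reorderer:
    def __init__(self, arr, fn):
        self.size = len(arr)
        arr = list(enumerate(arr))            # remember original positions
        arr = group(arr, lambda x: fn(x[1]))  # bucket items sharing a key
        arr = [([y[0] for y in x], x[0][1]) for x in arr]
        arr.sort(key=lambda x: fn(x[1]))      # visit buckets in key order
        self.arr = arr

    def get_reordered(self):
        return [x[1] for x in self.arr]       # one representative per bucket

    def get_original(self, newarr):
        res = [None] * self.size
        cov = [False] * self.size
        for (inds, _), v in zip(self.arr, newarr):
            for ind in inds:                  # share the result with every duplicate
                res[ind] = v
                cov[ind] = True
        assert all(cov)
        return res


# Duplicates ("a") are processed once; results come back in the original order.
reorderer = Reorderer(["bbb", "a", "cc", "a"], lambda s: s)
batch = reorderer.get_reordered()             # ['a', 'bbb', 'cc']
results = [s.upper() for s in batch]          # stand-in for the expensive LM call
print(reorderer.get_original(results))        # ['BBB', 'A', 'CC', 'A']
```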
@@ -2,8 +2,6 @@ import argparse
 import json
 import numpy as np
 import random
-import itertools
-import collections
 import logging
 from lm_eval import models, tasks, evaluator, base
@@ -35,7 +33,7 @@ def main():
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
     if not args.no_cache:
-        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
+        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
 
     if args.tasks == "all_tasks":
         task_names = tasks.ALL_TASKS
     else:
......
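A quick illustration of why the extra `.replace('/', '-')` in the cache path above matters: if `--model_args` contains a HuggingFace model ID with a slash, the old expression produces a nested path under `lm_cache/`. The model ID below is only an example, not taken from the diff.

```python
# Sketch of the cache-filename construction from main.py above.
model = "gpt2"
model_args = "pretrained=EleutherAI/gpt-neo-1.3B"  # hypothetical --model_args value

old_path = 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db'
new_path = 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'

print(old_path)  # lm_cache/gpt2_pretrained-EleutherAI/gpt-neo-1.3B.db  -> unexpected subdirectory
print(new_path)  # lm_cache/gpt2_pretrained-EleutherAI-gpt-neo-1.3B.db  -> single flat file
```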
-import argparse
-import json
-import numpy as np
 import random
-import itertools
-import collections
-import logging
-from lm_eval import models, tasks, evaluator, base
-import random
-from lm_eval.base import LM
 import transformers
+from lm_eval import tasks, evaluator
+from lm_eval.base import LM
 
 
 class DryrunLM(LM):
......
-import argparse
 import json
 import numpy as np
 import random
-import itertools
-import collections
 import logging
 from lm_eval import models, tasks, evaluator, base
 
 logging.getLogger("openai").setLevel(logging.WARNING)
......
@@ -2,7 +2,6 @@ import argparse
 import numpy as np
 import os
 import random
 from lm_eval import tasks
 from lm_eval.utils import join_iters
@@ -16,14 +15,13 @@ def parse_args():
     parser.add_argument('--provide_description', action="store_true")
     parser.add_argument('--sets', type=str, default="val")  # example: val,test
     parser.add_argument('--num_fewshot', type=int, default=1)
-    parser.add_argument('--seed', type=int, default=1234)
+    parser.add_argument('--seed', type=int, default=42)
     parser.add_argument('--num_examples', type=int, default=1)
     return parser.parse_args()
 
 
 def main():
     args = parse_args()
-    random.seed(args.seed)
     np.random.seed(args.seed)
 
     if args.tasks == "all_tasks":
@@ -33,6 +31,8 @@ def main():
     task_dict = tasks.get_task_dict(task_names)
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
+        rnd = random.Random()
+        rnd.seed(args.seed)
         iters = []
@@ -54,6 +54,7 @@ def main():
                 doc=doc,
                 provide_description=args.provide_description,
                 num_fewshot=args.num_fewshot,
+                rnd=rnd
             )
             f.write(ctx + "\n")
......
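The per-task `rnd` in write_out.py above works together with the new `rnd` parameter on `fewshot_examples`: instead of seeding the module-level `random` once, every task samples its few-shot examples from its own freshly seeded `random.Random`, so a task's output no longer depends on how much randomness earlier tasks consumed. A small sketch of the difference, using made-up docs and task names:

```python
import random

docs = list(range(100))


def sample_fewshot(docs, k, rnd):
    # mirrors the new fewshot_examples(self, k, rnd) signature
    return rnd.sample(docs, k)


# Global seeding: task_b's draw shifts whenever task_a draws first.
random.seed(42)
print("task_a", random.sample(docs, 5))
print("task_b", random.sample(docs, 5))   # different draw, order-dependent

# Per-task Random instances: every task gets the same, reproducible draw.
for task_name in ["task_a", "task_b"]:
    rnd = random.Random()
    rnd.seed(42)
    print(task_name, sample_fewshot(docs, 5, rnd))
```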
 import lm_eval.models as models
+import lm_eval.base as base
 
 
 def test_gpt2():
     gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
......