Commit 4b133dca authored by Leo Gao

Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness into cfsquad

# Conflicts:
#	lm_eval/tasks/squad.py
parents 8de85534 caba51e1
import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf
from ..metrics import mean
......@@ -40,12 +38,6 @@ class Pubmed_QA(HFTask):
def doc_to_target(self, doc):
return " {}".format(doc["final_decision"])
def fewshot_examples(self, k):
# Only test docs are available for this task, so few-shot examples are sampled from the test set
if self._training_docs is None:
self._training_docs = list(self.test_docs())
return random.sample(self._training_docs, k)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns
an iterable of Requests which will be sent to the LM.
......
import os
import numpy as np
from best_download import download_file
from lm_eval.base import MultipleChoiceTask, rf
from lm_eval.metrics import mean
import xml.etree.ElementTree as ET
import random
from lm_eval.base import MultipleChoiceTask
class QA4MRE(MultipleChoiceTask):
YEAR = None
......@@ -46,12 +44,6 @@ class QA4MRE(MultipleChoiceTask):
def has_test_docs(self):
return True
def fewshot_examples(self, k):
# Only test docs are available for this task, so few-shot examples are sampled from the test set
if self._training_docs is None:
self._training_docs = list(self.test_docs())
return random.sample(self._training_docs, k)
def _convert_standard(self, question):
choices = [i.text for i in question.iter('answer')]
out_doc = {
......
import json
import random
import os
from lm_eval.base import Task
from ..utils import sh
......
......@@ -5,11 +5,6 @@ from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
import os
from functools import reduce
import operator
from tqdm import tqdm
import json
class each:
def __init__(self, f):
......
import json
import random
import os
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
from ..utils import sh
from lm_eval.base import MultipleChoiceTask
class SATAnalogies(MultipleChoiceTask):
......
import os
import json
from ..utils import sh
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
import zipfile
from lm_eval.base import MultipleChoiceTask
from best_download import download_file
......
......@@ -17,7 +17,6 @@ def _squad_agg(key, items):
return _squad_metric(predictions=predictions, references=references)[key]
class SQuAD2(HFTask):
DATASET_PATH = "squad_v2"
DATASET_NAME = None
......
import json
import random
from lm_eval.base import Task
from ..utils import sh
import csv
class StoryCloze(Task):
NEEDS_MANUAL_DL = True
......
......@@ -4,11 +4,11 @@ To-do:
- ReCoRD
"""
import numpy as np
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from . common import HFTask, yesno
from lm_eval.base import rf
from ..metrics import mean, acc_all, metric_max_over_ground_truths
from ..utils import general_detokenize
......@@ -23,7 +23,7 @@ class BoolQ(HFTask):
return True
def has_test_docs(self):
return True
return False
def fewshot_description(self):
# TODO: figure out actual description
......@@ -74,7 +74,7 @@ class CommitmentBank(HFTask):
return True
def has_test_docs(self):
return True
return False
def fewshot_description(self):
# TODO: figure out actual description
......@@ -145,7 +145,7 @@ class Copa(HFTask):
return True
def has_test_docs(self):
return True
return False
def fewshot_description(self):
# TODO: figure out actual description
......@@ -209,7 +209,7 @@ class MultiRC(HFTask):
return True
def has_test_docs(self):
return True
return False
def fewshot_description(self):
# TODO: figure out actual description
......@@ -355,7 +355,7 @@ class WordsInContext(HFTask):
return True
def has_test_docs(self):
return True
return False
def fewshot_description(self):
# TODO: figure out actual description
......@@ -412,7 +412,7 @@ class SGWinogradSchemaChallenge(HFTask):
return True
def has_test_docs(self):
return True
return False
def training_docs(self):
if self.has_training_docs():
......
import abc
import json
import random
import os
from collections import Iterable
from pprint import pprint
import pycountry
from sacrebleu import sacrebleu
import logging
from lm_eval import metrics
from lm_eval.base import Task, rf
......@@ -86,11 +78,14 @@ class GeneralTranslationTask(Task):
} for src, ref in zip(self.src_data, self.ref_data)]
def doc_to_text(self, doc):
return doc["src"]
language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0])
tar_lang = code_to_language(language_codes[1])
return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"
def doc_to_target(self, doc):
# This shows a single target, though there may be multiple targets in a lang test
return doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
......
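For illustration, the prompt format produced by the rewritten doc_to_text above; the language pair, the code_to_language output, and the sample sentence are assumptions, not taken from this diff.
# Hypothetical de-en document rendered with the new prompt template
doc = {"src": "Guten Morgen.", "ref": "Good morning."}
src_lang, tar_lang = "German", "English"  # assumed result of code_to_language on "de"/"en"
prompt = f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"
# prompt == "German phrase: Guten Morgen.\nEnglish phrase:"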
import os
import json
import random
from lm_eval.base import Task, rf
from ..metrics import mean
from ..utils import sh
class TriviaQA(Task):
def download(self):
if not os.path.exists('data/triviaqa'):
......
import gzip
import json
import random
import shutil
from pathlib import Path
from best_download import download_file
......
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . common import HFTask
class WikiText103(NLP_TASK):
class WikiText103(HFTask):
NLP_PATH = "wikitext"
NLP_NAME = "wikitext-103-raw-v1"
......@@ -66,7 +63,7 @@ class WikiText103(NLP_TASK):
raise NotImplementedError('Evaluation not implemented')
class WikiText2(NLP_TASK):
class WikiText2(HFTask):
NLP_PATH = "wikitext"
NLP_NAME = "wikitext-2-raw-v1"
......
......@@ -56,10 +56,14 @@ class WinogradSchemaChallenge273(HFTask):
# TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def fewshot_examples(self, k):
def fewshot_examples(self, k, rnd):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
return random.sample(list(self.test_docs()), k)
if self._fewshot_docs is None:
self._fewshot_docs = list(self.test_docs())
return rnd.sample(list(self._fewshot_docs), k)
def doc_to_text(self, doc):
return self.partial_context(doc, doc["options"][doc["label"]])
......
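A minimal sketch of how the new rnd parameter to fewshot_examples might be exercised; the toy task and seed below are assumptions for illustration only.
import random
class _ToyTestOnlyTask:
    # Mirrors the pattern above: cache the test docs once, then sample with the caller's Random
    _fewshot_docs = None
    def test_docs(self):
        return [{"idx": i} for i in range(10)]
    def fewshot_examples(self, k, rnd):
        if self._fewshot_docs is None:
            self._fewshot_docs = list(self.test_docs())
        return rnd.sample(list(self._fewshot_docs), k)
rnd = random.Random(1234)
print(_ToyTestOnlyTask().fewshot_examples(3, rnd))  # reproducible across runs for a fixed seed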
import os
import re
import collections
class ExitCodeError(Exception):
......@@ -42,6 +43,14 @@ def chunks(iter, n):
if arr: yield arr
def group(arr, fn):
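# Bucket the items of arr by fn(item); buckets keep first-seen key order and are returned as a list of lists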
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return list(res.values())
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
......@@ -49,4 +58,34 @@ def general_detokenize(string):
string = string.replace("\" ", "\"")
string = string.replace(" \"", "\"")
string = re.sub(r" (['.,])", r"\1", string)
return string
\ No newline at end of file
return string
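An illustrative input/output pair for general_detokenize, using a made-up sentence and relying only on the replacements visible above:
# general_detokenize("I did n't see that coming , sorry .")
# -> "I didn't see that coming, sorry."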
class Reorderer:
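# Reorders items by a sort key for batched processing and later restores the caller's original order;
# items that share a key are collapsed to one representative whose result is broadcast back to every original position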
def __init__(self, arr, fn):
self.size = len(arr)
arr = list(enumerate(arr))
arr = group(arr, lambda x: fn(x[1]))
arr = [
([y[0] for y in x], x[0][1]) for x in arr
]
arr.sort(key=lambda x: fn(x[1]))
self.arr = arr
def get_reordered(self):
return [x[1] for x in self.arr]
def get_original(self, newarr):
res = [None] * self.size
cov = [False] * self.size
for (inds, _), v in zip(self.arr, newarr):
for ind in inds:
res[ind] = v
cov[ind] = True
assert all(cov)
return res
\ No newline at end of file
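A minimal round-trip sketch of the Reorderer added above, assuming group and Reorderer from this file are in scope; the toy data and identity sort key are illustrative assumptions.
# Identity key: duplicates collapse to one representative, items come back sorted
reord = Reorderer(["bb", "a", "bb", "c"], fn=lambda x: x)
batch = reord.get_reordered()            # ["a", "bb", "c"] - one entry per distinct key, sorted
results = [s.upper() for s in batch]     # stand-in for the expensive per-item work
print(reord.get_original(results))       # ["BB", "A", "BB", "C"] - back in the original order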
......@@ -2,8 +2,6 @@ import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
......@@ -35,7 +33,7 @@ def main():
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
if not args.no_cache:
lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
else:
......
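For illustration, the cache filename the updated CachingLM line above would produce when the model arguments contain a slash; the argument value is an assumption.
# args.model = "gpt2", args.model_args = "pretrained=EleutherAI/gpt-neo-1.3B"
# before: lm_cache/gpt2_pretrained-EleutherAI/gpt-neo-1.3B.db  (the slash creates an unintended subdirectory)
# after:  lm_cache/gpt2_pretrained-EleutherAI-gpt-neo-1.3B.db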
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
from lm_eval.base import LM
import transformers
from lm_eval import tasks, evaluator
class DryrunLM(LM):
......
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
logging.getLogger("openai").setLevel(logging.WARNING)
......
......@@ -2,7 +2,6 @@ import argparse
import numpy as np
import os
import random
from lm_eval import tasks
from lm_eval.utils import join_iters
......@@ -16,14 +15,13 @@ def parse_args():
parser.add_argument('--provide_description', action="store_true")
parser.add_argument('--sets', type=str, default="val") # example: val,test
parser.add_argument('--num_fewshot', type=int, default=1)
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--num_examples', type=int, default=1)
return parser.parse_args()
def main():
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
if args.tasks == "all_tasks":
......@@ -33,6 +31,8 @@ def main():
task_dict = tasks.get_task_dict(task_names)
os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items():
rnd = random.Random()
rnd.seed(args.seed)
iters = []
......@@ -54,6 +54,7 @@ def main():
doc=doc,
provide_description=args.provide_description,
num_fewshot=args.num_fewshot,
rnd=rnd
)
f.write(ctx + "\n")
......
import lm_eval.models as models
import lm_eval.base as base
def test_gpt2():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
......