Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into remove-dataset

1f8a8c1d · jon-tow · b4c0275d · b0acb337 · 1f8a8c1d · 1f8a8c1d
Commit 1f8a8c1d authored Jun 11, 2022 by jon-tow
20 changed files
--- a/lm_eval/tasks/quac.py
+++ b/lm_eval/tasks/quac.py
@@ -51,17 +51,34 @@ class QuAC(Task):
        raise NotImplementedError("QuAC has no test docs.")
    def _process_doc(self, doc):
-        doc["title"] = doc['title'] + ' - ' + doc['section_title']
+        doc["title"] = doc["title"] + " - " + doc["section_title"]
        return doc
    def doc_to_text(self, doc):
-        return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
+        return (
+            "TITLE: "
+            + doc["title"]
+            + "\n"
+            + "PARAGRAPH: "
+            + doc["paragraph"]
+            + "\n\n"
+            + "Q: "
+            + doc["question"]
+            + "\n\n"
+            + "A: "
+        )
+    def should_decontaminate(self):
+        return True
+    def doc_to_decontamination_query(self, doc):
+        return doc["paragraph"]
    def doc_to_target(self, doc):
-        return doc['answer']
+        return doc["answer"]
    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
        :param doc:
@@ -72,7 +89,7 @@ class QuAC(Task):
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
@@ -85,7 +102,7 @@ class QuAC(Task):
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
    def aggregation(self):
        """
@@ -94,7 +111,7 @@ class QuAC(Task):
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
    def higher_is_better(self):
        """
@@ -103,4 +120,4 @@ class QuAC(Task):
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
--- a/lm_eval/tasks/race.py
+++ b/lm_eval/tasks/race.py
--- a/lm_eval/tasks/sat.py
+++ b/lm_eval/tasks/sat.py
@@ -59,11 +59,19 @@ class SATAnalogies(MultipleChoiceTask):
    def _process_doc(self, doc):
        return {
-            'source': doc['source'],
+            "source": doc["source"],
-            'query': doc['stem'].split(' ')[:2],
+            "query": doc["stem"].split(" ")[:2],
-            'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in doc["choices"]],
+            "choices": [
-            'gold': ['a', 'b', 'c', 'd', 'e'].index(doc['solution'].strip()),
+                "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"]
+            ],
+            "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()),
        }
    def doc_to_text(self, doc):
-        return "{} is to {} as".format(*doc['query'])
+        return "{} is to {} as".format(*doc["query"])
+    def should_decontaminate(self):
+        return True
+    def doc_to_decontamination_query(self, doc):
+        return doc["source"] + "\n" + " ".join(doc["query"])
--- a/lm_eval/tasks/sciq.py
+++ b/lm_eval/tasks/sciq.py
@@ -54,10 +54,10 @@ class SciQ(MultipleChoiceTask):
            doc["distractor3"],
            doc["correct_answer"],
        ]
-        src = doc['support']
+        src = doc["support"]
        out_doc = {
            "source": src,
-            "query": doc['question'],
+            "query": doc["question"],
            "choices": choices,
            "gold": 3,
        }
@@ -65,3 +65,9 @@ class SciQ(MultipleChoiceTask):
    def doc_to_text(self, doc):
        return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
+    def should_decontaminate(self):
+        return True
+    def doc_to_decontamination_query(self, doc):
+        return doc["source"] + " " + doc["query"]
--- a/lm_eval/tasks/squad.py
+++ b/lm_eval/tasks/squad.py
--- a/lm_eval/tasks/storycloze.py
+++ b/lm_eval/tasks/storycloze.py
--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
--- a/lm_eval/tasks/translation.py
+++ b/lm_eval/tasks/translation.py
--- a/lm_eval/tasks/triviaqa.py
+++ b/lm_eval/tasks/triviaqa.py
@@ -43,10 +43,10 @@ class TriviaQA(Task):
        return False
    def training_docs(self):
-        return self.dataset['train']
+        return self.dataset["train"]
    def validation_docs(self):
-        return self.dataset['validation']
+        return self.dataset["validation"]
    def test_docs(self):
        raise NotImplementedError()
@@ -54,8 +54,14 @@ class TriviaQA(Task):
    def doc_to_text(self, doc):
        return f"Question: {doc['question']}\nAnswer:"
+    def should_decontaminate(self):
+        return True
+    def doc_to_decontamination_query(self, doc):
+        return doc["question"]
    def doc_to_target(self, doc):
-        return " " + doc['answer']['value']
+        return " " + doc["answer"]["value"]
    def _remove_prefixes(self, aliases):
        # Optimization: Remove any alias that has a strict prefix elsewhere in the list
@@ -69,15 +75,13 @@ class TriviaQA(Task):
    def construct_requests(self, doc, ctx):
        ret = []
-        for alias in self._remove_prefixes(doc['answer']['aliases']):
+        for alias in self._remove_prefixes(doc["answer"]["aliases"]):
            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
            ret.append(is_prediction)
        return ret
    def process_results(self, doc, results):
-        return {
+        return {"acc": float(any(results))}
-            "acc": float(any(results))
-        }
    def aggregation(self):
        return {
@@ -85,6 +89,4 @@ class TriviaQA(Task):
        }
    def higher_is_better(self):
-        return {
+        return {"acc": True}
-            "acc": True
-        }
--- a/lm_eval/tasks/truthfulqa.py
+++ b/lm_eval/tasks/truthfulqa.py
--- a/lm_eval/tasks/unscramble.py
+++ b/lm_eval/tasks/unscramble.py
--- a/lm_eval/tasks/webqs.py
+++ b/lm_eval/tasks/webqs.py
--- a/lm_eval/tasks/wikitext.py
+++ b/lm_eval/tasks/wikitext.py
@@ -90,6 +90,9 @@ class WikiText(PerplexityTask):
    def doc_to_target(self, doc):
        return wikitext_detokenizer(doc)
+    def should_decontaminate(self):
+        return True
    def count_words(self, doc):
        # count number of words in *original doc before detokenization*
        return len(re.split(r"\s+", doc))
--- a/lm_eval/tasks/winogrande.py
+++ b/lm_eval/tasks/winogrande.py
--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
--- a/main.py
+++ b/main.py
--- a/pile_statistics.json
+++ b/pile_statistics.json
--- a/scripts/clean_training_data/README.md
+++ b/scripts/clean_training_data/README.md
--- a/scripts/clean_training_data/compress_and_package.py
+++ b/scripts/clean_training_data/compress_and_package.py