Unverified Commit 5a766ac5 authored by farzanehnakhaee70's avatar farzanehnakhaee70 Committed by GitHub

Merge branch 'big-refactor' into big-refactor

parents a0c1dbbd 01cfb2ff
group:
  - pile
include: pile_arxiv.yaml
task: pile_ubuntu-irc
dataset_path: EleutherAI/the_pile
dataset_name: pile_ubuntu-irc
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
\ No newline at end of file
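For reference, the three metrics in the metric_list above are corpus-level statistics computed from the rolling log-likelihoods. A minimal sketch of how such weighted aggregations are typically computed, assuming each document contributes a (loglikelihood, word_count, byte_count) triple (the helper name is illustrative, not part of the harness):

import math

def aggregate_perplexities(items):
    # items: iterable of (loglikelihood, word_count, byte_count), one per document
    total_ll = sum(ll for ll, _, _ in items)
    total_words = sum(words for _, words, _ in items)
    total_bytes = sum(nbytes for _, _, nbytes in items)
    return {
        "word_perplexity": math.exp(-total_ll / total_words),  # weighted_perplexity over words
        "byte_perplexity": math.exp(-total_ll / total_bytes),  # weighted_perplexity over bytes
        "bits_per_byte": -total_ll / (total_bytes * math.log(2)),
    }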
group:
  - pile
include: pile_arxiv.yaml
task: pile_uspto
dataset_path: EleutherAI/the_pile
dataset_name: pile_uspto
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
\ No newline at end of file
group:
  - pile
include: pile_arxiv.yaml
task: pile_wikipedia
dataset_path: EleutherAI/the_pile
dataset_name: pile_wikipedia
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
\ No newline at end of file
group:
  - pile
include: pile_arxiv.yaml
task: pile_youtubesubtitles
dataset_path: EleutherAI/the_pile
dataset_name: pile_youtubesubtitles
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
\ No newline at end of file
group:
  - piqa_yaml_grp
task: piqa_yaml
group:
  - multiple_choice
task: piqa
dataset_path: piqa
dataset_name: null
output_type: multiple_choice
......@@ -9,7 +9,7 @@ validation_split: validation
test_split: null
template_aliases: "{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold label idx is
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{gold}}" # this will be cast to an int.
metric_list:
  - metric: acc
    aggregation: mean
......
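The template_aliases, doc_to_text, and doc_to_target fields above are Jinja templates rendered against each dataset row. A small sketch of that rendering on a made-up PIQA-style example (illustrative only; the harness uses its own template machinery):

from jinja2 import Environment

env = Environment()
doc = {"goal": "How do you open a jar?", "sol1": "Twist the lid.", "sol2": "Shake it.", "label": 0}
aliases = "{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}"

text = env.from_string(aliases + "Question: {{question}}\nAnswer:").render(**doc)
target = int(env.from_string(aliases + "{{gold}}").render(**doc))  # cast to an int, per the comment above
print(text)    # -> "Question: How do you open a jar?\nAnswer:"
print(target)  # -> 0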
group:
  - sciq_yaml_grp
task: sciq_yaml
group:
  - multiple_choice
task: sciq
dataset_path: sciq
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
# TODO: we should see how shuffling answer choices affects perf.
template_aliases: "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: "{{gold}}" # this will be cast to an int.
metric_list:
  - metric: acc
    aggregation: mean
......@@ -20,4 +19,4 @@ metric_list:
    higher_is_better: true
  - metric: acc_mutual_info
    aggregation: mean
    higher_is_better: true
group:
  - super-glue-lm-eval-v1
task: "default"
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
import numpy as np
import sklearn.metrics


def cb_multi_fi(items):
    # items: iterable of (prediction, gold) label pairs for CB's three classes
    preds, golds = zip(*items)
    preds = np.array(preds)
    golds = np.array(golds)
    # one-vs-rest F1 for each class, then the unweighted mean (the multi-class F1 reported for CB)
    f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
    f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
    f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
    avg_f1 = np.mean([f11, f12, f13])
    return avg_f1
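A quick illustration of how this aggregation behaves on a handful of (prediction, gold) pairs (the sample labels below are made up; for CB the classes are entailment, contradiction, and neutral):

items = [(0, 0), (1, 1), (2, 2), (1, 2), (0, 1)]
print(cb_multi_fi(items))  # mean of the three per-class (one-vs-rest) F1 scores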
group:
  - super-glue-lm-eval-v1
task: "default"
dataset_path: super_glue
dataset_name: cb
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
doc_to_target: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
metric_list:
  - metric: acc
  - metric: f1
    aggregation: !function "aggregate.cb_multi_fi"
"""
TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension
https://arxiv.org/pdf/1705.03551.pdf
TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
and independently gathered evidence documents, six per question on average, that provide
high quality distant supervision for answering the questions.
Homepage: https://nlp.cs.washington.edu/triviaqa/
"""
import inspect
# import lm_eval.datasets.triviaqa.triviaqa
import string
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean
_CITATION = """
@InProceedings{JoshiTriviaQA2017,
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2017},
address = {Vancouver, Canada},
publisher = {Association for Computational Linguistics},
}
"""
@register_task("triviaqa")
class TriviaQA(Task):
VERSION = 1
DATASET_PATH = "trivia_qa" # inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
DATASET_NAME = "unfiltered.nocontext"
OUTPUT_TYPE = "greedy_until"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
raise NotImplementedError()
def doc_to_text(self, doc):
return f"Q: {doc['question']}\nA:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
return " " + doc["answer"]["value"]
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
# we can do this because if the prefix is acceptable by isgreedy, we can stop looking
aliases.sort()
ret = [aliases[0]]
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
def construct_requests(self, doc, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = Instance(
request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=(
ctx,
{
"until": ["\n", ".", ","],
"do_sample": False,
},
),
idx=0,
**kwargs,
)
return continuation
def process_results(self, doc, results):
continuation = (
results[0]
.strip()
.lower()
.translate(str.maketrans("", "", string.punctuation))
)
list_of_candidates = [
alias.lower().translate(str.maketrans("", "", string.punctuation))
for alias in self._remove_prefixes(doc["answer"]["aliases"])
]
return {"em": float(continuation in list_of_candidates)}
def aggregation(self):
return {
"em": mean,
}
def higher_is_better(self):
return {"em": True}
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.api.task import PerplexityTask
from lm_eval.api.registry import register_task, register_group
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")
    return string
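
# Example (illustrative, not part of the original file): a quick before/after for the
# detokenizer above on a made-up WikiText-style string.
# >>> wikitext_detokenizer("He scored 1 @,@ 000 points ( a record ) in the game")
# 'He scored 1,000 points (a record) in the game'
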
@register_task("wikitext")
class WikiText(PerplexityTask):
VERSION = "2.0"
DATASET_PATH = "EleutherAI/wikitext_document_level"
DATASET_NAME = "wikitext-2-raw-v1"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return map(self._process_doc, self.dataset["train"])
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
return doc["page"]
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
def should_decontaminate(self):
return True
def count_words(self, doc):
# count number of words in *original doc before detokenization*
return len(re.split(r"\s+", doc))
......@@ -33,4 +33,4 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
- [x] Is in Eval-harness v1.0 ?
- [x] Has been checked for regression from v1.0?
- [ ] Has been checked for equivalence with original paper methodology?
- [ ] "Main" checked variant clearly denoted?
\ No newline at end of file
- [ ] "Main" checked variant clearly denoted?
import re


def wikitext_detokenizer(doc):
    string = doc["page"]
    # contractions
......
group:
- wikitext_group
task: wikitext_yaml
- perplexity
- loglikelihood_rolling
task: wikitext
dataset_path: EleutherAI/wikitext_document_level
dataset_name: wikitext-2-raw-v1
output_type: loglikelihood_rolling
......@@ -14,11 +15,5 @@ should_decontaminate: true
doc_to_decontamination_query: "{{page}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
\ No newline at end of file
......@@ -157,22 +157,31 @@ def make_table(result_dict):
    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    md_writer.headers = ["Task", "Version", "Filter", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = [
        "Task",
        "Version",
        "Filter",
        "Metric",
        "Value",
        "",
        "Stderr",
    ]

    values = []

    for k, dic in result_dict["results"].items():
        version = result_dict["versions"][k]
        for m, v in dic.items():
        for (mf), v in dic.items():
            m, _, f = mf.partition(",")
            if m.endswith("_stderr"):
                continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]
                values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se])
            if m + "_stderr" + "," + f in dic:
                se = dic[m + "_stderr" + "," + f]
                values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
            else:
                values.append([k, version, m, "%.4f" % v, "", ""])
                values.append([k, version, f, m, "%.4f" % v, "", ""])
            k = ""
            version = ""

    md_writer.value_matrix = values
......
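The substantive change here is that result keys are now "metric,filter" pairs rather than bare metric names. A tiny sketch of the key handling (values are made up):

mf = "acc,none"
m, _, f = mf.partition(",")
print(m, f)                      # acc none
print(m + "_stderr" + "," + f)   # acc_stderr,none  (the key looked up for the standard error)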
......@@ -5,11 +5,11 @@ import argparse
import logging
from lm_eval import evaluator, utils
from lm_eval.tasks import ALL_TASKS
from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
from lm_eval.logger import eval_logger
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logger = logging.getLogger("main")
ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
class MultiChoice:
    def __init__(self, choices):
......@@ -20,8 +20,10 @@ class MultiChoice:
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.warning("{} is not in task list.".format(value))
# eval_logger.info(f"{choices} is this")
eval_logger.info(f"Available tasks to choose:")
# for choice in self.choices:
# eval_logger.info(f" {choice}")
eval_logger.info(ALL_TASKS)
return True
def __iter__(self):
......
......@@ -72,7 +72,7 @@ def main():
                doc=doc,
                num_fewshot=args.num_fewshot,
                rnd=rnd,
                description=description,
                # description=description,
            )
            f.write(ctx + "\n")
......