Commit 9f518392 authored by lintangsutawika's avatar lintangsutawika
Browse files

resolved merge conflict

parents 37ccb191 bf26d979
import re
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from abc import abstractmethod
from datasets import load_metric
from transformers import AutoTokenizer
from functools import reduce
from lm_eval.api.task import Task
from lm_eval.api.metrics import mean
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
author = "Shaham, Uri and
Segal, Elad and
Ivgi, Maor and
Efrat, Avia and
Yoran, Ori and
Haviv, Adi and
Gupta, Ankit and
Xiong, Wenhan and
Geva, Mor and
Berant, Jonathan and
Levy, Omer",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.823",
pages = "12007--12021"
}
"""
# SCROLLS is formulated as a sequence-to-sequence task.
# To allow for evaluation of causal models, we'll
# reformulate these with appropriate prompts
def _download_metric():
    """Download the official SCROLLS metric script from the HF Hub and return
    the path of a local copy suitable for `datasets.load_metric`.

    The copy is renamed with dots replaced by underscores so that the file
    stem is a valid Python module name for the metric loader.
    """
    import os
    import shutil

    from huggingface_hub import hf_hub_download

    scrolls_metric_path = hf_hub_download(
        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
    )
    # BUG FIX: join directory and file name with the platform separator;
    # the previous string `+` fused the directory onto the new file name
    # (e.g. "/cache/dirscrolls_py.py"), dropping the copy into the parent dir.
    updated_scrolls_metric_path = os.path.join(
        os.path.dirname(scrolls_metric_path),
        os.path.basename(scrolls_metric_path).replace(".", "_") + ".py",
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path
def _process_doc_prepended_question(doc):
# "When a query is given in addition to the raw text (as
# in QMSum, Qasper, NarrativeQA, QuALITY, and ContractNLI),
# we prepend it to the text, using two newlines as a natural separator"
input = doc["input"]
split = input.find("\n\n")
return {
"id": doc["id"],
"pid": doc["pid"],
"input": input,
"outputs": doc["outputs"],
"question": input[0:split],
"text": input[split + 2 :],
}
def _drop_duplicates_in_input(untokenized_dataset):
    # Adapted from scrolls/evaluator/dataset_evaluator.py: collapse rows that
    # share an "id" into a single row whose new "outputs" column collects
    # every reference "output" seen for that id.
    keep_indices = []
    position_for_id = {}
    grouped_outputs = []
    for row_idx, (doc_id, output) in enumerate(
        zip(untokenized_dataset["id"], untokenized_dataset["output"])
    ):
        if doc_id in position_for_id:
            # Duplicate id: append its reference to the first occurrence.
            grouped_outputs[position_for_id[doc_id]].append(output)
        else:
            keep_indices.append(row_idx)
            position_for_id[doc_id] = len(grouped_outputs)
            grouped_outputs.append([output])
    deduped = untokenized_dataset.select(keep_indices).flatten_indices()
    deduped = deduped.remove_columns("output")
    return deduped.add_column("outputs", grouped_outputs)
def _num_cpu_cores():
# https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
try:
import psutil
return psutil.cpu_count(logical=False)
except ImportError:
import os
return len(os.sched_getaffinity(0))
class _SCROLLSTask(Task):
    """Common scaffolding for SCROLLS tasks: downloads and deduplicates the
    dataset, exposes train/validation splits, and wires up the official
    SCROLLS metric script.
    """

    VERSION = 0
    DATASET_PATH = "tau/scrolls"
    DATASET_NAME = None
    # Optional pruning configuration: when both PRUNE_TOKENIZERS and
    # PRUNE_MAX_TOKENS are set, inputs longer than the token budget under any
    # listed tokenizer are dropped (see prune()).
    PRUNE_TOKENIZERS = None
    PRUNE_MAX_TOKENS = None
    PRUNE_NUM_PROC = None

    def __post_init__(self):
        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        # The test split is removed in download(); SCROLLS test labels are hidden.
        return False

    def training_docs(self):
        for doc in self.dataset["train"]:
            yield from self._process_doc(doc)

    def validation_docs(self):
        for doc in self.dataset["validation"]:
            yield from self._process_doc(doc)

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["input"]

    def download(self, *args, **kwargs):
        super().download(*args, **kwargs)
        del self.dataset["test"]
        for split in self.dataset:
            self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
        # BUG FIX: the original condition checked PRUNE_TOKENIZERS twice;
        # pruning needs both a tokenizer list and a token budget.
        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_MAX_TOKENS is not None:
            self.prune()

    def _get_prune_text(self, sample):
        return self.doc_to_text(self._process_doc(sample)[0])

    def prune(self):
        """Create a pruned version of a SCROLLS task dataset containing only inputs
        that are less than `max_tokens` when tokenized by each tokenizer
        """
        tokenizers = [
            AutoTokenizer.from_pretrained(tokenizer)
            for tokenizer in self.PRUNE_TOKENIZERS
        ]
        cache = {}

        def _filter(sample):
            text = self._get_prune_text(sample)
            cached = cache.get(text, None)
            if cached is None:
                # Reject as soon as any tokenizer exceeds the budget.
                for tokenizer in tokenizers:
                    if len(tokenizer(text).input_ids) > self.PRUNE_MAX_TOKENS:
                        cache[text] = False
                        return False
                cache[text] = True
                return True
            else:
                return cached

        self.dataset = self.dataset.filter(_filter, num_proc=self.PRUNE_NUM_PROC)

    def doc_to_target(self, doc):
        return " " + ", ".join(doc["outputs"])

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"

    def higher_is_better(self):
        return {x: True for x in self._scrolls_metrics().keys()}

    @abstractmethod
    def _scrolls_metrics(self):
        # Maps result-dict keys to keys in the SCROLLS metric's output.
        pass

    def _make_compute_metrics(self, value):
        def compute_metrics(samples):
            predictions, references = zip(*samples)  # unzip, if you will
            computed = self.metric.compute(
                predictions=predictions, references=references
            )
            return computed[value]

        return compute_metrics

    def aggregation(self):
        return {
            key: self._make_compute_metrics(value)
            for key, value in self._scrolls_metrics().items()
        }
class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
    """SCROLLS task scored as multiple choice via per-choice loglikelihoods."""

    def __post_init__(self):
        # Scoring is done locally from loglikelihoods; no metric script needed.
        self.metric = None

    def _scrolls_metrics(self):
        return None

    def aggregation(self):
        return {"em": mean, "acc": mean, "acc_norm": mean}

    def higher_is_better(self):
        return {"em": True, "acc": True, "acc_norm": True}

    def process_results(self, doc, results):
        gold = doc["gold"]

        acc = 1.0 if np.argmax(results) == gold else 0.0
        # Length-normalized accuracy: divide each loglikelihood by the
        # character length of its choice.
        completion_len = np.array([float(len(i)) for i in doc["choices"]])
        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0

        return {
            "acc": acc,
            "acc_norm": acc_norm,
            # SCROLLS reports EM on a 0-100 scale.
            "em": acc_norm * 100.0,
        }

    def construct_requests(self, doc, ctx, **kwargs):
        # BUG FIX: iterate with enumerate() -- the original wrote
        # `for i, choice in doc["choices"]`, which tries to unpack each
        # choice string into (i, choice) and raises ValueError instead of
        # producing (index, choice) pairs.
        request_list = [
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " {}".format(choice)),
                idx=i,
                **kwargs,
            )
            for i, choice in enumerate(doc["choices"])
        ]
        return request_list
class _SCROLLSSummaryTask(_SCROLLSTask):
    """SCROLLS task scored with ROUGE against a generated summary."""

    def _process_doc(self, doc):
        # Summarization uses the raw document as-is; no question/text split.
        return [doc]

    def _scrolls_metrics(self):
        return {
            "rouge1": "rouge/rouge1",
            "rouge2": "rouge/rouge2",
            "rougeL": "rouge/rougeL",
        }

    def process_results(self, doc, results):
        generated_summary = results[0]
        return {
            rouge_key: (generated_summary, doc["outputs"])
            for rouge_key in ("rouge1", "rouge2", "rougeL")
        }

    def construct_requests(self, doc, ctx, **kwargs):
        # Single free-form generation, stopping at the first newline.
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
            **kwargs,
        )

    def doc_to_text(self, doc):
        return (
            doc["input"]
            + "\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
        )
@register_task("scrolls_qasper")
class Qasper(_SCROLLSTask):
    """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
    https://arxiv.org/abs/2105.03011
    """

    DATASET_NAME = "qasper"

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        # Yes/no questions are scored by comparing loglikelihoods rather than
        # by generation; detect them by normalizing every reference answer.
        doc["is_yes_no"] = all(
            squad_metrics.normalize_answer(ref) in ["yes", "no"]
            for ref in doc["outputs"]
        )
        return [doc]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def process_results(self, doc, results):
        if doc["is_yes_no"]:
            # results hold the loglikelihoods of " yes" and " no".
            prediction = " yes" if results[0] > results[1] else " no"
        elif not results[0].strip():
            # An empty generation maps to the dataset's unanswerable marker.
            prediction = "Unanswerable"
        else:
            prediction = results[0]
        return {"f1": (prediction, doc["outputs"])}

    def construct_requests(self, doc, ctx, **kwargs):
        if not doc["is_yes_no"]:
            # Free-form answer: a single generation request.
            return Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs,
            )
        # Yes/no answer: score both continuations.
        return [
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, continuation),
                idx=i,
                **kwargs,
            )
            for i, continuation in enumerate([" yes", " no"])
        ]
@register_task("scrolls_quality")
class QuALITY(_SCROLLSMultipleChoiceTask):
    """QuALITY: Question Answering with Long Input Texts, Yes!
    https://arxiv.org/abs/2112.08608
    """

    DATASET_NAME = "quality"
    _multiple_choice_pattern = re.compile(r" *\([A-D]\) *")

    @staticmethod
    def _normalize_answer(text):
        # Collapse internal whitespace runs and trim both ends.
        return " ".join(text.split()).strip()

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)

        # The options "(A) ... (D) ..." precede the passage; the passage
        # starts at the first blank line after the "(D)" option.
        boundary = doc["text"].find("\n\n", doc["text"].find("(D)"))
        options_block = doc["text"][:boundary]
        doc["text"] = doc["text"][boundary:].strip()
        doc["choices"] = [
            QuALITY._normalize_answer(option)
            for option in re.split(QuALITY._multiple_choice_pattern, options_block)[1:]
        ]
        doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
        return [doc]
@register_task("scrolls_narrativeqa")
class NarrativeQA(_SCROLLSTask):
    """The NarrativeQA Reading Comprehension Challenge
    https://arxiv.org/abs/1712.07040
    """

    DATASET_NAME = "narrative_qa"

    def _process_doc(self, doc):
        # Query-based dataset: split the prepended question off the document.
        return [_process_doc_prepended_question(doc)]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def _get_prune_text(self, doc):
        # NarrativeQA asks many questions about the same long documents, so
        # cache pruning decisions on the document text alone (not the
        # question) -- a large speedup with identical results per document.
        return self._process_doc(doc)[0]["text"]

    def process_results(self, doc, results):
        generated_answer = results[0]
        return {"f1": (generated_answer, doc["outputs"])}

    def construct_requests(self, doc, ctx, **kwargs):
        # Single free-form generation, stopping at the first newline.
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
            **kwargs,
        )
@register_task("scrolls_contractnli")
class ContractNLI(_SCROLLSMultipleChoiceTask):
    """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
    https://arxiv.org/abs/2110.01799
    """

    # NOTE: the docstring previously cited arXiv 1712.07040, which is the
    # NarrativeQA paper; corrected to the ContractNLI paper.
    DATASET_NAME = "contract_nli"
    # Fixed three-way NLI label space.
    CHOICES = ["Not mentioned", "Entailment", "Contradiction"]

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        doc["choices"] = ContractNLI.CHOICES
        # Gold index: position of the first reference answer in CHOICES.
        doc["gold"] = ContractNLI.CHOICES.index(doc["outputs"][0])
        return [doc]

    def doc_to_text(self, doc):
        # NLI-style prompt: the prepended "question" is the hypothesis.
        return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"
@register_task("scrolls_govreport")
class GovReport(_SCROLLSSummaryTask):
    """Efficient Attentions for Long Document Summarization
    https://arxiv.org/abs/2104.02112

    Note: The average length of the reference summaries is ~3,000
    characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models,
    it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024)
    to allow a full summary to be generated.
    """

    DATASET_NAME = "gov_report"
@register_task("scrolls_summscreenfd")
class SummScreenFD(_SCROLLSSummaryTask):
    """SummScreen: A Dataset for Abstractive Screenplay Summarization
    https://arxiv.org/abs/2104.07091
    """

    # Plain summary task: prompting and ROUGE scoring are inherited from
    # _SCROLLSSummaryTask.
    DATASET_NAME = "summ_screen_fd"
@register_task("scrolls_qmsum")
class QMSum(_SCROLLSSummaryTask):
    """QMSum: A New Benchmark for Query-based Multi-domain
    Meeting Summarization
    https://arxiv.org/abs/2104.05938
    """

    DATASET_NAME = "qmsum"

    def _process_doc(self, doc):
        # Query-based summarization: split the prepended query off the
        # meeting transcript.
        return [_process_doc_prepended_question(doc)]

    def doc_to_text(self, doc):
        return "{}\n\nQuestion: {}\nAnswer:".format(doc["text"], doc["question"])
# Social IQA
### Paper
Title: Social IQA: Commonsense Reasoning about Social Interactions
Abstract: https://arxiv.org/abs/1904.09728
> We introduce Social IQa, the first largescale benchmark for commonsense reasoning about social situations. Social IQa contains 38,000 multiple choice questions for probing emotional and social intelligence in a variety of everyday situations (e.g., Q: "Jordan wanted to tell Tracy a secret, so Jordan leaned towards Tracy. Why did Jordan do this?" A: "Make sure no one else could hear"). Through crowdsourcing, we collect commonsense questions along with correct and incorrect answers about social interactions, using a new framework that mitigates stylistic artifacts in incorrect answers by asking workers to provide the right answer to a different but related question. Empirical results show that our benchmark is challenging for existing question-answering models based on pretrained language models, compared to human performance (>20% gap). Notably, we further establish Social IQa as a resource for transfer learning of commonsense knowledge, achieving state-of-the-art performance on multiple commonsense reasoning tasks (Winograd Schemas, COPA).
Homepage: https://allenai.org/data/socialiqa
### Citation
```
@inproceedings{sap2019social,
title={Social IQa: Commonsense Reasoning about Social Interactions},
author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and Le Bras, Ronan and Choi, Yejin},
booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
pages={4463--4473},
year={2019}
}
```
### Checklist
For adding novel benchmarks/datasets to the library:
* [X] Is the task an existing benchmark in the literature?
* [X] Have you referenced the original paper that introduced the task?
* [X] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? The original paper doesn't have an associated implementation, but there is an official entry in [BigBench](https://github.com/google/BIG-bench/tree/main/bigbench/benchmark_tasks/social_iqa). I use the same prompting format as BigBench.
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# Social IQA: commonsense reasoning about social interactions
# (https://arxiv.org/abs/1904.09728); prompt format follows the BigBench entry.
task: social_iqa
dataset_path: social_i_qa
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
# Present the situation context and question, then score each answer choice.
doc_to_text: "Q: {{context}} {{question}}\nA:"
target_delimiter: " "
doc_to_choice: ["{{answerA}}", "{{answerB}}", "{{answerC}}"]
# `label` selects the gold choice -- verify its indexing against doc_to_choice.
doc_to_target: "{{label}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
......@@ -34,12 +34,11 @@ Homepage: https://rajpurkar.github.io/SQuAD-explorer/
#### Groups
* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
* Not part of a group yet
#### Tasks
* `squadv2`: `Default squadv2 task`
* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of model predicting there is no answer`
### Checklist
......
# Shared template for the SQuAD v2 tasks (pulled in via `include: _template_yaml`).
dataset_path: squad_v2
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
# Use the answer list when present; otherwise a single empty string (unanswerable).
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
# Default SQuAD v2 task: generate an answer until newline and score with
# SQuAD exact-match / F1 (see utils.exact / utils.f1).
include: _template_yaml
task: squadv2
output_type: generate_until
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
# Companion task: perplexity of the model predicting " unanswerable" for each
# question (probability of abstaining).
include: _template_yaml
task: squadv2_noans_loglikelihood
output_type: loglikelihood
doc_to_target: " unanswerable"
metric_list:
  - metric: perplexity
"""
Know What You Don’t Know: Unanswerable Questions for SQuAD
https://arxiv.org/pdf/1806.03822.pdf
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span, from the
corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering.
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import datasets
from evaluate import load
from math import exp
from functools import partial
from packaging import version
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
_CITATION = """
@misc{rajpurkar2018know,
title={Know What You Don't Know: Unanswerable Questions for SQuAD},
author={Pranav Rajpurkar and Robin Jia and Percy Liang},
year={2018},
eprint={1806.03822},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def _squad_metric(predictions, references):
    """Compute the SQuAD v2 metric suite for the given prediction/reference dicts.

    Uses `evaluate.load` (imported at module top as `load`):
    `datasets.load_metric` is deprecated and has been removed from recent
    `datasets` releases.
    """
    squad_metric = load("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)
def _squad_agg(key, items):
    # Each item is a (prediction, reference) pair; unzip them, run the full
    # SQuAD v2 metric suite once, and pull out the requested submetric.
    predictions, references = zip(*items)
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get(key, 0)
@register_task("squadv2")
class SQuAD2(Task):
    """SQuAD v2: generate an answer span and, in parallel, score the
    loglikelihood of the model predicting " unanswerable".
    """

    VERSION = 1
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(
        "1.11.0"
    ), "datasets v1.11.0 or later required for SQuAD"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # Title / Background / Question prompt, answered on the final line.
        return (
            "Title: "
            + doc["title"]
            + "\n\n"
            + "Background: "
            + doc["context"]
            + "\n\n"
            + "Question: "
            + doc["question"]
            + "\n\n"
            + "Answer:"
        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        # First gold answer; unanswerable questions target the literal
        # "unanswerable".
        answer_list = doc["answers"]["text"]
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = "unanswerable"
        return " " + answer

    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # NOTE(review): both requests use idx=0; process_results distinguishes
        # them by position (generation first, loglikelihood second) — confirm.
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs
            ),
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " " + "unanswerable"),
                idx=0,
                **kwargs
            ),
        ]

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # results = [generated answer, (logprob of " unanswerable", greedy flag)]
        continuation, (logprob_unanswerable, _) = results

        no_answer_probability = exp(logprob_unanswerable)

        predictions = {
            "id": doc["id"],
            "prediction_text": continuation,
            "no_answer_probability": no_answer_probability,
        }

        references = {
            "id": doc["id"],
            "answers": doc["answers"],
        }

        # Every submetric shares the same (predictions, references) pair; the
        # aggregation functions extract their own key from the metric output.
        return {
            "exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": (
                predictions,
                references,
            ),  # Best exact match (with varying threshold)
            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "exact": partial(
                _squad_agg, "exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": partial(
                _squad_agg, "f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": partial(
                _squad_agg, "HasAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": partial(
                _squad_agg, "HasAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": partial(
                _squad_agg, "NoAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": partial(
                _squad_agg, "NoAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": partial(
                _squad_agg, "best_exact"
            ),  # Best exact match (with varying threshold)
            "best_f1": partial(
                _squad_agg, "best_f1"
            ),  # Best F1 (with varying threshold)
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "f1": True,  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "best_exact": True,  # Best exact match (with varying threshold)
            "best_f1": True,  # Best F1 (with varying threshold)
        }
import re
import string
import collections
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # Pipeline (same order as the official SQuAD script):
    # lowercase -> strip punctuation -> drop articles -> collapse whitespace.
    lowered = s.lower()
    punctuation = set(string.punctuation)
    no_punct = "".join(ch for ch in lowered if ch not in punctuation)
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct, flags=re.UNICODE)
    return " ".join(no_articles.split())
def get_tokens(s):
    # Whitespace-tokenize after SQuAD normalization; empty/None input yields
    # no tokens.
    return normalize_answer(s).split() if s else []
# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
    # Compare the first prediction to the first reference after SQuAD-style
    # normalization; 1 on a match, 0 otherwise.
    gold = normalize_answer(references[0])
    pred = normalize_answer(predictions[0])
    return int(gold == pred)
# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    # If either side is no-answer, F1 is 1 if both agree, 0 otherwise.
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_shared = sum(overlap.values())
    if num_shared == 0:
        return 0
    precision = num_shared / len(pred_toks)
    recall = num_shared / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
# Group that runs both SQuAD v2 variants together.
group: squadv2_complete
task:
  - squadv2
  - squadv2_noans_loglikelihood
......@@ -10,7 +10,7 @@ import collections
import importlib.util
import fnmatch
from typing import Iterator, List, Literal, Union
from typing import Iterator, List, Literal, Union, Any, Callable
import gc
import torch
......@@ -19,7 +19,16 @@ import transformers
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice
from lm_eval.logger import eval_logger
import logging
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
def escaped_split(text, sep_char, maxsplit=-1):
......@@ -50,7 +59,12 @@ def handle_arg_string(arg):
return True
elif arg.lower() == "false":
return False
return arg
elif arg.isnumeric():
return int(arg)
try:
return float(arg)
except ValueError:
return arg
def simple_parse_args_string(args_string):
......@@ -75,6 +89,32 @@ def join_iters(iters):
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
for i, x in enumerate(iter):
arr.append(x)
......@@ -185,7 +225,13 @@ def make_disjoint_window(pair):
class Reorderer:
def __init__(self, arr, fn) -> None:
def __init__(self, arr: List[Any], fn: Callable) -> None:
"""Reorder an array according to some function
Args:
arr (List[Any]): The initial array
fn (Callable[[Any], Any]): A function to determine the priority of elements
"""
self.size = len(arr)
arr = list(enumerate(arr))
arr = group(arr, lambda x: fn(x[1]))
......@@ -197,9 +243,22 @@ class Reorderer:
self.arr = arr
def get_reordered(self):
"""Gets the reordered array
Returns:
List[Any]: The reordered array
"""
return [x[1] for x in self.arr]
def get_original(self, newarr):
"""Restores the original order of a new array based on the old array's order
Args:
newarr (List[Any]): The array to be restored
Returns:
List[Any]: The array restored to the original order
"""
res = [None] * self.size
cov = [False] * self.size
......@@ -301,6 +360,10 @@ def make_table(result_dict, column: str = "results"):
for k, dic in result_dict[column].items():
version = result_dict["versions"][k]
n = str(result_dict["n-shot"][k])
if "alias" in dic:
k = dic.pop("alias")
for (mf), v in dic.items():
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
......@@ -418,7 +481,6 @@ yaml.add_constructor("!function", import_function)
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
if yaml_config is None:
with open(yaml_path, "rb") as file:
yaml_config = yaml.full_load(file)
......@@ -439,7 +501,6 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
include_path.reverse()
final_yaml_config = {}
for path in include_path:
# Assumes that path is a full path.
# If not found, assume the included yaml
# is in the same dir as the original yaml
......@@ -562,7 +623,14 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
# we look back for 2 more tokens than it takes to encode our stop sequence
# because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
# and we don't want to mistakenly not stop a generation because our
# (string) stop sequence was output in a different tokenization
# NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
# and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
self.sequence_id_len = len(self.sequence_ids) + 2
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
......@@ -572,7 +640,6 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if not done:
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
......
......@@ -71,6 +71,7 @@ promptsource = [
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
vllm = ["vllm"]
all = [
"lm_eval[dev]",
"lm_eval[testing]",
......@@ -80,5 +81,6 @@ all = [
"lm_eval[promptsource]",
"lm_eval[gptq]",
"lm_eval[anthropic]",
"lm_eval[openai]"
"lm_eval[openai]",
"lm_eval[vllm]",
]
......@@ -4,9 +4,8 @@ import json
import os
import random
from lm_eval import tasks
from lm_eval.utils import join_iters
from lm_eval.tasks import include_path
from lm_eval.logger import eval_logger
from lm_eval.utils import join_iters, eval_logger
from lm_eval.tasks import initialize_tasks, include_path
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
......@@ -25,6 +24,12 @@ def parse_args():
default=None,
help="Additional path to include if there are external tasks to include.",
)
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Log error when tasks are not registered.",
)
return parser.parse_args()
......@@ -32,6 +37,8 @@ def main():
args = parse_args()
np.random.seed(args.seed)
initialize_tasks(args.verbosity)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
......
import unittest
from unittest.mock import patch
import hashlib
import json
import os
import pickle
from lm_eval.models.gguf import GGUFLM
from lm_eval.api.instance import Instance
base_url = "https://matthoffner-ggml-llm-api.hf.space"
def gguf_completion_mock(base_url=None, **kwargs):
    """Stand-in for GGUFLM.gguf_completion that replays cached responses.

    A SHA-256 hash of the call parameters keys a pickle file under
    ./tests/testdata; on a cache miss a canned response dict is synthesized
    and written there for future runs.
    """
    # Generate a hash from the parameters
    hash_kwargs = {"base_url": base_url, **kwargs}
    hash = hashlib.sha256(
        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()

    fname = f"./tests/testdata/gguf_test_{hash}.pkl"

    if os.path.exists(fname):
        # Cache hit: replay the stored response.
        with open(fname, "rb") as fh:
            return pickle.load(fh)
    else:
        print("The file does not exist, attempting to write...")
        if "stop" in kwargs:
            # generate_until-style call: echo the stop sequence in the text.
            result = {
                "choices": [
                    {
                        "text": f"generated text until {kwargs['stop']}",
                        "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0},
                        "finish_reason": "length",
                    }
                ]
            }
        else:
            # loglikelihood-style call: fixed response captured from a real server.
            # generated with # curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}'
            result = {
                "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3",
                "object": "text_completion",
                "created": 1700511361,
                "model": "../llama-2-7b.Q8_0.gguf",
                "choices": [
                    {
                        "text": "string(",
                        "index": 0,
                        "logprobs": {
                            "text_offset": [0, 7],
                            "token_logprobs": [None, -1.033263319857306],
                            "tokens": [" string", "("],
                            "top_logprobs": [
                                None,
                                {
                                    "(": -1.033263319857306,
                                    "[]": -2.6530743779017394,
                                    ".": -3.0377145947291324,
                                    "\n": -3.0399156750513976,
                                    "_": -3.510376089937872,
                                    " =": -3.6957918347193663,
                                    ",": -3.9309459866358702,
                                    " of": -4.2834550083949035,
                                    '("': -4.322762841112799,
                                    "()": -4.426229113466925,
                                },
                            ],
                        },
                        "finish_reason": "length",
                    }
                ],
                "usage": {
                    "prompt_tokens": 2,
                    "completion_tokens": 1,
                    "total_tokens": 3,
                },
            }

        # Best-effort cache write: a failure is logged but the canned result
        # is still returned.
        try:
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            print("Writing file at", fname)
            with open(fname, "wb") as fh:
                pickle.dump(result, fh)
            print("File written successfully")
        except Exception as e:
            print("File writing failed:", e)

        return result
class GGUFLMTest(unittest.TestCase):
    """Tests for GGUFLM with the HTTP completion endpoint mocked out by
    gguf_completion_mock.
    """

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_loglikelihood(self, gguf_completion_mock):
        lm = GGUFLM(base_url)

        # Test loglikelihood
        requests = [
            Instance(
                request_type="loglikelihood",
                doc=args,
                arguments=args,
                idx=i,
            )
            for i, args in enumerate([("str", "ing"), ("str", "ing")])
        ]
        res = lm.loglikelihood(requests)

        # Assert the loglikelihood response is correct
        expected_res = [(logprob, True) for logprob in [0, 0]]
        self.assertEqual(res, expected_res)

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_generate_until(self, gguf_completion_mock):
        lm = GGUFLM(base_url)

        # Test generate_until
        requests = [
            Instance(
                request_type="generate_until",
                doc={"input": doc},
                arguments=(doc, {"until": stop}),
                idx=i,
            )
            for i, (doc, stop) in enumerate([("input1", "stop1"), ("input2", "stop2")])
        ]
        res = lm.generate_until(requests)

        # Assert the generate_until response is correct
        expected_res = ["generated text until stop1", "generated text until stop2"]
        self.assertEqual(res, expected_res)
# @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock)
# def test_loglikelihood_rolling(self, gguf_completion_mock):
# lm = GGUFLM(base_url)
# # Test loglikelihood_rolling
# requests = ["input1", "input2"]
# res = lm.loglikelihood_rolling(requests)
# # Assert the loglikelihood_rolling response is correct
# expected_res = [(-1.2345, True), (-1.2345, True)]
# self.assertEqual(res, expected_res)
if __name__ == "__main__":
    # Allow running this test module directly (`python thisfile.py`) in
    # addition to discovery via a test runner.
    unittest.main()
......@@ -8,6 +8,8 @@ import lm_eval.tasks as tasks
import sys
import torch
tasks.initialize_tasks()
class Test_HFLM:
torch.use_deterministic_algorithms(True)
......@@ -15,7 +17,7 @@ class Test_HFLM:
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until: list[Instance] = generate_until_task.instances
......@@ -115,7 +117,7 @@ class Test_HFLM:
def test_logliklihood_rolling(self) -> None:
res = self.LM.loglikelihood_rolling(self.ROLLING)
assert np.allclose(res, self.ROLLING_RES, atol=1e-2)
assert np.allclose(res, self.ROLLING_RES, atol=1e-1)
def test_toc_encode(self) -> None:
res = self.LM.tok_encode("foo bar")
......
import pytest
from typing import List
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
import sys
import torch
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
    """Smoke tests for the vLLM backend: loglikelihood, generation, and
    rolling loglikelihood over small request batches.

    Skipped unless CUDA (and the ``vllm`` package) is available.
    """

    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM

        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        pass
    torch.use_deterministic_algorithms(True)
    tasks.initialize_tasks()

    # Build a 10-example request batch for each request type.
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    # Cap generation length so the test stays fast.
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until: List[Instance] = generate_until_task.instances

    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    # TODO: make proper tests
    def test_logliklihood(self) -> None:
        outputs = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(outputs) == len(self.MULTIPLE_CH)
        # Each entry is a tuple whose first element is the logprob.
        assert all(isinstance(entry[0], float) for entry in outputs)

    def test_generate_until(self) -> None:
        generations = self.LM.generate_until(self.generate_until)
        assert len(generations) == len(self.generate_until)
        assert all(isinstance(text, str) for text in generations)

    def test_logliklihood_rolling(self) -> None:
        for score in self.LM.loglikelihood_rolling(self.ROLLING):
            assert isinstance(score, float)
......@@ -11,6 +11,7 @@ from typing import List
import random
import pytest
tasks.initialize_tasks()
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
......
......@@ -4,7 +4,7 @@ from .utils import new_tasks
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment