gaoqiong / lm-evaluation-harness · Commits

Commit a58674ae
Authored Nov 17, 2023 by lintangsutawika

    merge conflict resolved

Parents: 10cc0a56, f76941ef
Showing 14 changed files with 843 additions and 126 deletions (+843, -126):
lm_eval/api/task.py                            +81   -32
lm_eval/evaluator.py                            +8    -8
lm_eval/tasks/__init__.py                      +11    -0
lm_eval/tasks/scrolls/README.md                +31    -0
lm_eval/tasks/scrolls/scrolls.yaml              +9    -0
lm_eval/tasks/scrolls/task.py                 +460    -0
lm_eval/tasks/squadv2/README.md                 +1    -2
lm_eval/tasks/squadv2/_template_yaml            +0    -8
lm_eval/tasks/squadv2/default.yaml              +0   -13
lm_eval/tasks/squadv2/no_ans.yaml               +0    -6
lm_eval/tasks/squadv2/task.py                 +240    -0
lm_eval/tasks/squadv2/utils.py                  +0   -51
lm_eval/tasks/squadv2/with_noans_prob.yaml      +0    -4
tests/models/test_huggingface.py                +2    -2
lm_eval/api/task.py  (+81, -32)

```diff
@@ -94,7 +94,7 @@ class TaskConfig(dict):
     metadata: str = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks

     def __post_init__(self) -> None:
-        if "." in self.dataset_path:
+        if self.dataset_path and ("." in self.dataset_path):
             import inspect
             from importlib import import_module

@@ -207,19 +207,9 @@ class Task(abc.ABC):
         self._fewshot_docs = None
         self._instances = None

-        self._config = TaskConfig(**config) if config else TaskConfig()
+        self._config = TaskConfig({**config}) if config else TaskConfig()

-        self._filters = []
-        for name, components in self._config.get(
-            "filters", [["none", [["take_first", None]]]]
-        ):
-            filter_pipeline = build_filter_ensemble(name, components)
-            self._filters.append(filter_pipeline)
-
-        self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random(1234)
-        )
+        if not hasattr(self, "_filters"):
+            self._filters = [build_filter_ensemble("none", [["take_first", None]])]

     def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
         """Downloads and returns the task dataset.

@@ -360,9 +350,7 @@ class Task(abc.ABC):
             False
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

-        eval_logger.info(
-            f"Building contexts for task on rank {rank}..."
-        )
+        eval_logger.info(f"Building contexts for task '{self.config.task}' on rank {rank}...")

         instances = []
         for doc_id, doc in utils.create_iterator(

@@ -452,7 +440,13 @@ class Task(abc.ABC):
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot):
+    def fewshot_context(
+        self,
+        doc,
+        num_fewshot,
+        rnd=random.Random(1234),
+        description=None,
+    ):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -460,34 +454,56 @@ class Task(abc.ABC):
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
+        :param rnd: random.Random
+            The pseudo-random number generator used to randomly sample examples.
+            WARNING: This is currently a required arg although it's optionalized with a default `None`.
+        :param description: str
+            The task's description that will be prepended to the fewshot examples.
         :returns: str
             The fewshot context.
         """
+        assert (
+            rnd is not None
+        ), "A `random.Random` generator argument must be provided to `rnd`"
+
+        description = description if description else ""
+
         if num_fewshot == 0:
-            # always prepend the (possibly empty) task description
-            labeled_examples = self.config.description
+            labeled_examples = ""
         else:
-            labeled_examples = self.config.description + self.sampler.get_context(
-                doc, num_fewshot
-            )
+            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
+            if self.has_training_docs():
+                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
+            else:
+                if self._fewshot_docs is None:
+                    self._fewshot_docs = list(
+                        self.validation_docs()
+                        if self.has_validation_docs()
+                        else self.test_docs()
+                    )
+
+                fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
+
+                # get rid of the doc that's the one we're evaluating, if it's in the fewshot
+                fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
+
+            labeled_examples = (
+                "\n\n".join(
+                    [
+                        self.doc_to_text(doc) + self.doc_to_target(doc)
+                        for doc in fewshotex
+                    ]
+                )
+                + "\n\n"
+            )

         example = self.doc_to_text(doc)
-        if type(example) == str:
-            return labeled_examples + example
-        elif type(example) == list:
-            return [labeled_examples + ex for ex in example]
-        elif type(example) == int:
-            if self.config.doc_to_choice is not None:
-                choices = self.doc_to_choice(doc)
-                return labeled_examples + choices[example]
-            else:
-                return labeled_examples + str(example)
+        return description + labeled_examples + example

     def apply_filters(self):
         if hasattr(self, "_filters"):
             for f in self._filters:
-                f.apply(self._instances)
+                f.apply(self._instances, None)
         else:
             eval_logger.warning("No filter defined, passing through instances")
             return self._instances

@@ -767,6 +783,39 @@ class ConfigurableTask(Task):
         )
         return super().fewshot_docs()

+    @utils.positional_deprecated
+    def fewshot_context(self, doc, num_fewshot):
+        """Returns a fewshot context string that is made up of a prepended description
+        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
+
+        :param doc: str
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param num_fewshot: int
+            The number of fewshot examples to provide in the returned context string.
+        :returns: str
+            The fewshot context.
+        """
+        if num_fewshot == 0:
+            # always prepend the (possibly empty) task description
+            labeled_examples = self.config.description
+        else:
+            labeled_examples = self.config.description + self.sampler.get_context(
+                doc, num_fewshot
+            )
+
+        example = self.doc_to_text(doc)
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]
+        elif type(example) == int:
+            if self.config.doc_to_choice is not None:
+                choices = self.doc_to_choice(doc)
+                return labeled_examples + choices[example]
+            else:
+                return labeled_examples + str(example)
+
     def apply_filters(self):
         if hasattr(self, "_filters"):
             for f in self._filters:
```
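For orientation, here is a minimal sketch of how the restored base-class signature is meant to be driven by the Python-defined tasks added in this commit. The task choice, description string, and zero-shot setting are illustrative, not part of the diff:

```python
import random

from lm_eval.tasks.squadv2.task import SQuAD2

task = SQuAD2()  # downloads squad_v2 via Task.download()
doc = list(task.validation_docs())[0]

# Old-style API restored on the base Task class: the caller supplies the RNG
# (required despite the default) and an optional description to prepend.
ctx = task.fewshot_context(
    doc,
    num_fewshot=0,
    rnd=random.Random(1234),
    description="Answer the question, or reply 'unanswerable'.\n\n",
)
requests = task.construct_requests(doc, ctx)  # one generate_until + one loglikelihood Instance
```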
lm_eval/evaluator.py  (+8, -8)

```diff
@@ -225,6 +225,7 @@ def evaluate(
             versions[group_name] = "N/A"

         else:
+            group_name = None
             task_hierarchy[task_name] = []

         if task is None:

@@ -236,8 +237,10 @@ def evaluate(
             if "task_alias" in configs[task_name]:
                 task_group_alias[task_name] = configs[task_name]["task_alias"]

-            if ("group_alias" in configs[task_name]) and (
-                group_name not in task_group_alias
+            if (
+                ("group_alias" in configs[task_name])
+                and (group_name not in task_group_alias)
+                and (group_name is not None)
             ):
                 task_group_alias[group_name] = configs[task_name]["group_alias"]

@@ -267,12 +270,9 @@ def evaluate(
                 eval_logger.info(f"Request: {str(inst)}")

         # aggregate Instances by LM method requested to get output.
-        reqtype = (
-            "loglikelihood"
-            if task.OUTPUT_TYPE == "multiple_choice"
-            else task.OUTPUT_TYPE
-        )  # TODO: this is hacky, fix in task.py
-        requests[reqtype].extend(task.instances)
+        for instance in task.instances:
+            reqtype = instance.request_type
+            requests[reqtype].append(instance)

         if lm.world_size > 1:
             instances_rnk = torch.tensor(len(task._instances), device=lm.device)
```
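This evaluator change matters for the Python tasks added in this commit: a single task can now emit a mix of request types (Qasper, for example, builds loglikelihood requests for yes/no questions and generate_until requests otherwise), so requests are bucketed by each instance's own request_type rather than by the task's OUTPUT_TYPE. A minimal sketch of the new grouping, with an invented list standing in for task.instances:

```python
from collections import defaultdict

# Invented stand-ins for Instance objects: (request_type, payload).
instances = [
    ("loglikelihood", "qasper doc 0, ' yes'"),
    ("loglikelihood", "qasper doc 0, ' no'"),
    ("generate_until", "qasper doc 1"),
]

requests = defaultdict(list)
for request_type, payload in instances:
    requests[request_type].append(payload)  # grouped per instance, not per task

print(dict(requests))
# {'loglikelihood': [..., ...], 'generate_until': [...]}
```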
lm_eval/tasks/__init__.py  (+11, -0)

```diff
@@ -15,6 +15,17 @@ from lm_eval.api.registry import (

 import logging

+# import python tasks
+from .squadv2.task import SQuAD2
+from .scrolls.task import (
+    QuALITY,
+    NarrativeQA,
+    ContractNLI,
+    GovReport,
+    SummScreenFD,
+    QMSum,
+)
+
 eval_logger = utils.eval_logger
```
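Importing the new task modules here runs their @register_task decorators, so the Python-defined tasks resolve by name through the registry the same way the updated test below fetches arc_easy and gsm8k. A small sketch (instantiating a task triggers its dataset download):

```python
from lm_eval import tasks

# The decorators in squadv2/task.py and scrolls/task.py populate TASK_REGISTRY
# at import time, so the new Python tasks can be looked up like any other task.
squad_task = tasks.TASK_REGISTRY.get("squadv2")()
quality_task = tasks.TASK_REGISTRY.get("scrolls_quality")()
```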
lm_eval/tasks/scrolls/README.md  (new file, +31)

"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533

SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.

Homepage: https://www.scrolls-benchmark.com/

Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:

```
class QasperGPTNeoX4K(Qasper):
    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
    PRUNE_MAX_TOKENS = 4096
    PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
```

`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
that use different tokenizers but the same maximum sequence length.

Once the subset task class has been defined in this file, it can be used by adding the class
to `lm_eval/tasks/__init__.py`.

NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""
lm_eval/tasks/scrolls/scrolls.yaml  (new file, +9)

```yaml
group: scrolls
task:
  - scrolls_qasper
  - scrolls_quality
  - scrolls_narrativeqa
  - scrolls_contractnli
  - scrolls_govreport
  - scrolls_summscreenfd
  - scrolls_qmsum
```
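A minimal sketch of exercising one of the grouped subtasks end to end through the harness's Python entry point. The model choice, the small limit, and the assumption that the newly registered Python tasks resolve by name through simple_evaluate are all illustrative, not guaranteed by this diff:

```python
from lm_eval.evaluator import simple_evaluate

# Hypothetical smoke test: a small model on one SCROLLS subtask, a few docs only.
results = simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["scrolls_qasper"],
    num_fewshot=0,
    limit=8,
)
print(results["results"]["scrolls_qasper"])
```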
lm_eval/tasks/scrolls/task.py  (new file, +460)

```python
import re
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics

from abc import abstractmethod
from datasets import load_metric
from transformers import AutoTokenizer
from functools import reduce

from lm_eval.api.task import Task
from lm_eval.api.metrics import mean
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task

_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
    title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
    author = "Shaham, Uri  and
      Segal, Elad  and
      Ivgi, Maor  and
      Efrat, Avia  and
      Yoran, Ori  and
      Haviv, Adi  and
      Gupta, Ankit  and
      Xiong, Wenhan  and
      Geva, Mor  and
      Berant, Jonathan  and
      Levy, Omer",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.823",
    pages = "12007--12021"
}
"""

# SCROLLS is formulated as a sequence-to-sequence task.
# To allow for evaluation of causal models, we'll
# reformulate these with appropriate prompts


def _download_metric():
    import os
    import shutil
    from huggingface_hub import hf_hub_download

    scrolls_metric_path = hf_hub_download(
        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
    )
    updated_scrolls_metric_path = (
        os.path.dirname(scrolls_metric_path)
        + os.path.basename(scrolls_metric_path).replace(".", "_")
        + ".py"
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path


def _process_doc_prepended_question(doc):
    # "When a query is given in addition to the raw text (as
    # in QMSum, Qasper, NarrativeQA, QuALITY, and ContractNLI),
    # we prepend it to the text, using two newlines as a natural separator"
    input = doc["input"]
    split = input.find("\n\n")
    return {
        "id": doc["id"],
        "pid": doc["pid"],
        "input": input,
        "outputs": doc["outputs"],
        "question": input[0:split],
        "text": input[split + 2 :],
    }


def _drop_duplicates_in_input(untokenized_dataset):
    # from scrolls/evaluator/dataset_evaluator.py
    indices_to_keep = []
    id_to_idx = {}
    outputs = []
    for i, (id_, output) in enumerate(
        zip(untokenized_dataset["id"], untokenized_dataset["output"])
    ):
        if id_ in id_to_idx:
            outputs[id_to_idx[id_]].append(output)
            continue
        indices_to_keep.append(i)
        id_to_idx[id_] = len(outputs)
        outputs.append([output])
    untokenized_dataset = untokenized_dataset.select(indices_to_keep).flatten_indices()
    untokenized_dataset = untokenized_dataset.remove_columns("output")
    untokenized_dataset = untokenized_dataset.add_column("outputs", outputs)
    return untokenized_dataset


def _num_cpu_cores():
    # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
    try:
        import psutil

        return psutil.cpu_count(logical=False)
    except ImportError:
        import os

        return len(os.sched_getaffinity(0))


class _SCROLLSTask(Task):
    VERSION = 0
    DATASET_PATH = "tau/scrolls"
    DATASET_NAME = None
    PRUNE_TOKENIZERS = None
    PRUNE_MAX_TOKENS = None
    PRUNE_NUM_PROC = None

    def __post_init__(self):
        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        for doc in self.dataset["train"]:
            yield from self._process_doc(doc)

    def validation_docs(self):
        for doc in self.dataset["validation"]:
            yield from self._process_doc(doc)

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["input"]

    def download(self, *args, **kwargs):
        super().download(*args, **kwargs)
        del self.dataset["test"]
        for split in self.dataset:
            self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_TOKENIZERS is not None:
            self.prune()

    def _get_prune_text(self, sample):
        return self.doc_to_text(self._process_doc(sample)[0])

    def prune(self):
        """Create a pruned version of a SCROLLS task dataset containing only inputs
        that are less than `max_tokens` when tokenized by each tokenizer
        """
        tokenizers = [
            AutoTokenizer.from_pretrained(tokenizer)
            for tokenizer in self.PRUNE_TOKENIZERS
        ]
        cache = {}

        def _filter(sample):
            text = self._get_prune_text(sample)
            cached = cache.get(text, None)
            if cached is None:
                for tokenizer in tokenizers:
                    if len(tokenizer(text).input_ids) > self.PRUNE_MAX_TOKENS:
                        cache[text] = False
                        return False
                cache[text] = True
                return True
            else:
                return cached

        self.dataset = self.dataset.filter(_filter, num_proc=self.PRUNE_NUM_PROC)

    def doc_to_target(self, doc):
        return " " + ", ".join(doc["outputs"])

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"

    def higher_is_better(self):
        return {x: True for x in self._scrolls_metrics().keys()}

    @abstractmethod
    def _scrolls_metrics(self):
        pass

    def _make_compute_metrics(self, value):
        def compute_metrics(samples):
            predictions, references = zip(*samples)  # unzip, if you will
            computed = self.metric.compute(
                predictions=predictions, references=references
            )
            return computed[value]

        return compute_metrics

    def aggregation(self):
        return {
            key: self._make_compute_metrics(value)
            for key, value in self._scrolls_metrics().items()
        }


class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
    def __post_init__(self):
        self.metric = None

    def _scrolls_metrics(self):
        return None

    def aggregation(self):
        return {"em": mean, "acc": mean, "acc_norm": mean}

    def higher_is_better(self):
        return {"em": True, "acc": True, "acc_norm": True}

    def process_results(self, doc, results):
        gold = doc["gold"]

        acc = 1.0 if np.argmax(results) == gold else 0.0
        completion_len = np.array([float(len(i)) for i in doc["choices"]])
        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0

        return {
            "acc": acc,
            "acc_norm": acc_norm,
            "em": acc_norm * 100.0,
        }

    def construct_requests(self, doc, ctx, **kwargs):
        request_list = [
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " {}".format(choice)),
                idx=i,
                **kwargs,
            )
            for i, choice in enumerate(doc["choices"])
        ]
        return request_list


class _SCROLLSSummaryTask(_SCROLLSTask):
    def _process_doc(self, doc):
        return [doc]

    def _scrolls_metrics(self):
        return {
            "rouge1": "rouge/rouge1",
            "rouge2": "rouge/rouge2",
            "rougeL": "rouge/rougeL",
        }

    def process_results(self, doc, results):
        return {
            "rouge1": (results[0], doc["outputs"]),
            "rouge2": (results[0], doc["outputs"]),
            "rougeL": (results[0], doc["outputs"]),
        }

    def construct_requests(self, doc, ctx, **kwargs):
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
            **kwargs,
        )

    def doc_to_text(self, doc):
        return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"


@register_task("scrolls_qasper")
class Qasper(_SCROLLSTask):
    """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
    https://arxiv.org/abs/2105.03011
    """

    DATASET_NAME = "qasper"

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        doc["is_yes_no"] = reduce(
            lambda prev, cur: prev
            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
            doc["outputs"],
            True,
        )
        return [doc]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def process_results(self, doc, results):
        if doc["is_yes_no"]:
            prediction = " yes" if results[0] > results[1] else " no"
        elif len(results[0].strip()) == 0:
            prediction = "Unanswerable"
        else:
            prediction = results[0]
        return {"f1": (prediction, doc["outputs"])}

    def construct_requests(self, doc, ctx, **kwargs):
        if doc["is_yes_no"]:
            return [
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " yes"),
                    idx=0,
                    **kwargs,
                ),
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " no"),
                    idx=1,
                    **kwargs,
                ),
            ]
        else:
            return Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs,
            )


@register_task("scrolls_quality")
class QuALITY(_SCROLLSMultipleChoiceTask):
    """QuALITY: Question Answering with Long Input Texts, Yes!
    https://arxiv.org/abs/2112.08608
    """

    DATASET_NAME = "quality"
    _multiple_choice_pattern = re.compile(r" *\([A-D]\) *")

    @staticmethod
    def _normalize_answer(text):
        return " ".join(text.split()).strip()

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)

        split = doc["text"].find("\n\n", doc["text"].find("(D)"))
        choices_text = doc["text"][:split]

        doc["text"] = doc["text"][split:].strip()
        doc["choices"] = [
            QuALITY._normalize_answer(choice)
            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
        ]
        doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))

        return [doc]


@register_task("scrolls_narrativeqa")
class NarrativeQA(_SCROLLSTask):
    """The NarrativeQA Reading Comprehension Challenge
    https://arxiv.org/abs/1712.07040
    """

    DATASET_NAME = "narrative_qa"

    def _process_doc(self, doc):
        return [_process_doc_prepended_question(doc)]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def _get_prune_text(self, doc):
        # pruning narrativeqa takes forever -- let's cheat a bit
        # and just cache on the text, not the question, since
        # the dataset is different questions about the same large
        # documents
        return self._process_doc(doc)[0]["text"]

    def process_results(self, doc, results):
        return {"f1": (results[0], doc["outputs"])}

    def construct_requests(self, doc, ctx, **kwargs):
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
            **kwargs,
        )


@register_task("scrolls_contractnli")
class ContractNLI(_SCROLLSMultipleChoiceTask):
    """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
    https://arxiv.org/abs/1712.07040
    """

    DATASET_NAME = "contract_nli"
    CHOICES = ["Not mentioned", "Entailment", "Contradiction"]

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        doc["choices"] = ContractNLI.CHOICES
        doc["gold"] = ContractNLI.CHOICES.index(doc["outputs"][0])
        return [doc]

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"


@register_task("scrolls_govreport")
class GovReport(_SCROLLSSummaryTask):
    """Efficient Attentions for Long Document Summarization
    https://arxiv.org/abs/2104.02112

    Note: The average length of the reference summaries is ~3,000
    characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models,
    it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024)
    to allow a full summary to be generated.
    """

    DATASET_NAME = "gov_report"


@register_task("scrolls_summscreenfd")
class SummScreenFD(_SCROLLSSummaryTask):
    """SummScreen: A Dataset for Abstractive Screenplay Summarization
    https://arxiv.org/abs/2104.07091
    """

    DATASET_NAME = "summ_screen_fd"


@register_task("scrolls_qmsum")
class QMSum(_SCROLLSSummaryTask):
    """QMSum: A New Benchmark for Query-based Multi-domain
    Meeting Summarization
    https://arxiv.org/abs/2104.05938
    """

    DATASET_NAME = "qmsum"

    def _process_doc(self, doc):
        return [_process_doc_prepended_question(doc)]

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"
```
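To make the input convention handled by _process_doc_prepended_question concrete (the sample record below is invented for illustration), a SCROLLS record stores "<query>\n\n<document>" in its input field and the helper splits it back apart:

```python
sample = {
    "id": "example-1",
    "pid": "example-1_0",
    "input": "What does the contract say about termination?\n\n"
    "This Agreement may be terminated by either party...",
    "outputs": ["Entailment"],
}

processed = _process_doc_prepended_question(sample)
print(processed["question"])  # "What does the contract say about termination?"
print(processed["text"])      # "This Agreement may be terminated by either party..."
```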
lm_eval/tasks/squadv2/README.md  (+1, -2)

```diff
@@ -34,12 +34,11 @@ Homepage: https://rajpurkar.github.io/SQuAD-explorer/

 #### Groups

-* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
+* Not part of a group yet

 #### Tasks

 * `squadv2`: `Default squadv2 task`
-* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of model predicting there is no answer`

 ### Checklist
```
lm_eval/tasks/squadv2/_template_yaml  (deleted, -8)

```yaml
dataset_path: squad_v2
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
```
lm_eval/tasks/squadv2/default.yaml  (deleted, -13)

```yaml
include: _template_yaml
task: squadv2
output_type: generate_until
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
```
lm_eval/tasks/squadv2/no_ans.yaml  (deleted, -6)

```yaml
include: _template_yaml
task: squadv2_noans_loglikelihood
output_type: loglikelihood
doc_to_target: " unanswerable"
metric_list:
  - metric: perplexity
```
lm_eval/tasks/squadv2/task.py  (new file, +240)

```python
"""
Know What You Don't Know: Unanswerable Questions for SQuAD
https://arxiv.org/pdf/1806.03822.pdf

Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span, from the
corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering.

Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import datasets

from evaluate import load
from math import exp
from functools import partial
from packaging import version

from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task

_CITATION = """
@misc{rajpurkar2018know,
    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
    year={2018},
    eprint={1806.03822},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
"""


def _squad_metric(predictions, references):
    # squad_metric = load("squad_v2")
    squad_metric = datasets.load_metric("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


def _squad_agg(key, items):
    predictions, references = zip(*items)

    return _squad_metric(predictions=predictions, references=references).get(key, 0)


@register_task("squadv2")
class SQuAD2(Task):
    VERSION = 1
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(
        "1.11.0"
    ), "datasets v1.11.0 or later required for SQuAD"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return (
            "Title: "
            + doc["title"]
            + "\n\n"
            + "Background: "
            + doc["context"]
            + "\n\n"
            + "Question: "
            + doc["question"]
            + "\n\n"
            + "Answer:"
        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        answer_list = doc["answers"]["text"]
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = "unanswerable"
        return " " + answer

    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs
            ),
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " " + "unanswerable"),
                idx=0,
                **kwargs
            ),
        ]

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        continuation, (logprob_unanswerable, _) = results

        no_answer_probability = exp(logprob_unanswerable)

        predictions = {
            "id": doc["id"],
            "prediction_text": continuation,
            "no_answer_probability": no_answer_probability,
        }

        references = {
            "id": doc["id"],
            "answers": doc["answers"],
        }

        return {
            "exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": (
                predictions,
                references,
            ),  # Best exact match (with varying threshold)
            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "exact": partial(
                _squad_agg, "exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": partial(
                _squad_agg, "f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": partial(
                _squad_agg, "HasAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": partial(
                _squad_agg, "HasAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": partial(
                _squad_agg, "NoAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": partial(
                _squad_agg, "NoAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": partial(
                _squad_agg, "best_exact"
            ),  # Best exact match (with varying threshold)
            "best_f1": partial(
                _squad_agg, "best_f1"
            ),  # Best F1 (with varying threshold)
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "f1": True,  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "best_exact": True,  # Best exact match (with varying threshold)
            "best_f1": True,  # Best F1 (with varying threshold)
        }
```
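For orientation, a small sketch of the shapes flowing through process_results above (the id, text, and log-probability are invented): each document yields a generated continuation plus the loglikelihood of the literal " unanswerable" string, and the per-document (predictions, references) pairs are what _squad_agg later unzips and hands to the HF squad_v2 metric:

```python
from math import exp

# Invented per-document result, matching the two Instances built in
# construct_requests: (generated text, (logprob of " unanswerable", is_greedy)).
results = ("Denver Broncos", (-8.2, False))

continuation, (logprob_unanswerable, _) = results
prediction = {
    "id": "example-qid",  # invented id
    "prediction_text": continuation,
    "no_answer_probability": exp(logprob_unanswerable),
}
reference = {
    "id": "example-qid",
    "answers": {"text": ["Denver Broncos"], "answer_start": [177]},
}

# _squad_agg("f1", [(prediction, reference), ...]) zips these into two lists
# and reads the "f1" entry out of the squad_v2 metric's output.
```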
lm_eval/tasks/squadv2/utils.py  (deleted, -51)

```python
import re
import string
import collections


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
```
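As a quick worked example of the token-overlap F1 this deleted helper computed (the strings are invented): gold "the Denver Broncos" and prediction "Denver Broncos won" share two normalized tokens, giving precision 2/3, recall 1.0, and F1 0.8:

```python
# normalize_answer lowercases, strips punctuation and articles, then splits on whitespace.
gold_toks = ["denver", "broncos"]          # from "the Denver Broncos"
pred_toks = ["denver", "broncos", "won"]   # from "Denver Broncos won"

num_same = 2                               # tokens shared by both (Counter intersection)
precision = num_same / len(pred_toks)      # 2/3
recall = num_same / len(gold_toks)         # 1.0
f1 = 2 * precision * recall / (precision + recall)
print(f1)                                  # 0.8 (up to floating-point rounding)
```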
lm_eval/tasks/squadv2/with_noans_prob.yaml  (deleted, -4)

```yaml
group: squadv2_complete
task:
  - squadv2
  - squadv2_noans_loglikelihood
```
tests/models/test_huggingface.py  (+2, -2)

```diff
@@ -15,7 +15,7 @@ class Test_HFLM:
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
+    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances

@@ -115,7 +115,7 @@ class Test_HFLM:
     def test_logliklihood_rolling(self) -> None:
         res = self.LM.loglikelihood_rolling(self.ROLLING)
-        assert np.allclose(res, self.ROLLING_RES, atol=1e-2)
+        assert np.allclose(res, self.ROLLING_RES, atol=1e-1)

     def test_toc_encode(self) -> None:
         res = self.LM.tok_encode("foo bar")
```