Unverified Commit 1f23061b authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #467 from EleutherAI/configurable-mcqa-ppl

[Refactor, WIP] Multiple Choice + loglikelihood_rolling support for YAML tasks
parents e7f49cca 337419b8
......@@ -22,4 +22,10 @@ HIGHER_IS_BETTER_REGISTRY = {
"bleu": True,
"chrf": True,
"ter": False,
"acc": True,
"acc_norm": True,
"word_perplexity": False,
"byte_perplexity": False,
"bits_per_byte": False,
}
\ No newline at end of file
......@@ -11,6 +11,7 @@ class Instance:
resps: list = field(default_factory=list)
filtered_resps: dict = field(default_factory=dict)
# initialized after init
task_name: str = None
doc_id: str = None
repeats: str = None
......
......@@ -10,7 +10,12 @@ import evaluate
AGGREGATION_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_REGISTRY = {
"acc": None,
"acc_norm": None,
"word_perplexity": None,
"byte_perplexity": None,
}
def register_metric(name):
......@@ -45,6 +50,7 @@ searching in HF Evaluate library...")
def register_aggregation(name):
# TODO: should we enforce a specific interface to aggregation metrics?
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
......@@ -155,6 +161,7 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
@register_metric("perplexity")
@register_aggregation("perplexity")
def perplexity(items):
return math.exp(-mean(items))
......@@ -165,10 +172,13 @@ def weighted_mean(items):
@register_metric("weighted_perplexity")
@register_aggregation("weighted_perplexity")
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
@register_metric("bits_per_byte")
@register_aggregation("bits_per_byte")
def bits_per_byte(items):
return -weighted_mean(items) / math.log(2)
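For reference, a minimal standalone sketch of how these aggregations combine per-document (loglikelihood, count) pairs, assuming weighted_mean divides the summed loglikelihoods by the summed counts (the pairs emitted by the loglikelihood_rolling process_results further down in this diff); the numbers below are made up:

```python
import math

# Re-implemented locally for illustration; mirrors the aggregations registered above.
def weighted_mean(items):
    # items: list of (loglikelihood, weight) pairs, e.g. (sum of token logprobs, word/byte count)
    a, b = zip(*items)
    return sum(a) / sum(b)

def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))

def bits_per_byte(items):
    return -weighted_mean(items) / math.log(2)

# Hypothetical per-document results: (total loglikelihood, byte count)
items = [(-1200.0, 900), (-800.0, 650)]
print(weighted_perplexity(items))  # exp(2000/1550) ~ 3.63
print(bits_per_byte(items))        # (2000/1550) / ln(2) ~ 1.86
```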
......
import abc
from typing import Union
from lm_eval import utils
MODEL_REGISTRY = {}
def register_model(name):
# TODO: should fairseq/elk be cited for this design pattern?
def register_model(*names):
# either pass a list or a single alias.
# function receives them as a tuple of strings
def decorate(cls):
assert (
issubclass(cls, LM)
), f"Model '{name}' ({cls.__name__}) must extend LM class"
for name in names:
assert (
issubclass(cls, LM)
), f"Model '{name}' ({cls.__name__}) must extend LM class"
assert (
name not in MODEL_REGISTRY
), f"Model named '{name}' conflicts with existing model!"
assert (
name not in MODEL_REGISTRY
), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
MODEL_REGISTRY[name] = cls
MODEL_REGISTRY[name] = cls
return cls
return decorate
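A self-contained sketch of how the refactored decorator is used, mirroring the @register_model("hf-causal", "gpt2") change later in this diff; DummyLM and its aliases are hypothetical:

```python
MODEL_REGISTRY = {}

class LM:
    """Stand-in for lm_eval.api.model.LM."""
    pass

def register_model(*names):
    # accepts one or more aliases for the same class
    def decorate(cls):
        for name in names:
            assert issubclass(cls, LM), f"Model '{name}' ({cls.__name__}) must extend LM class"
            assert name not in MODEL_REGISTRY, f"Model named '{name}' conflicts with existing model!"
            MODEL_REGISTRY[name] = cls
        return cls
    return decorate

@register_model("my-model", "my-model-alias")  # hypothetical aliases
class DummyLM(LM):
    pass

# both aliases resolve to the same class
assert MODEL_REGISTRY["my-model"] is MODEL_REGISTRY["my-model-alias"] is DummyLM
```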
......
......@@ -5,13 +5,15 @@ import re
import evaluate
import random
import itertools
import functools
import datasets
import numpy as np
from typing import List, Union
from lm_eval.api import METRIC_REGISTRY, AGGREGATION_REGISTRY, HIGHER_IS_BETTER_REGISTRY
from lm_eval.api.metrics import METRIC_REGISTRY, AGGREGATION_REGISTRY
from lm_eval.api import HIGHER_IS_BETTER_REGISTRY
from lm_eval.api.instance import Instance
from lm_eval.api.metrics import get_metric, get_aggregation, mean, weighted_perplexity, bits_per_byte
from lm_eval import utils
......@@ -36,10 +38,11 @@ class TaskConfig(dict):
doc_to_text: str = ""
doc_to_target: str = ""
# aggregation: dict = None # TODO: remove, I think these 2 are obsolete w/ current metric_list impl.
# higher_is_better: dict = None
num_fewshot: int = 0
batch_size: int = 1
repeats: int = 1
metric_list: str = None
gold_alias: str = None
output_type: str = "greedy_until"
......@@ -122,7 +125,8 @@ class Task(abc.ABC):
filter_pipeline = build_filter_ensemble(name, components)
self._filters.append(filter_pipeline)
self.sampler = samplers.Sampler(self.training_docs(), self, rnd=random.Random()) # TODO: pass the correct docs in here
self.sampler = samplers.Sampler(self.fewshot_docs(), self, rnd=random.Random()) # TODO: pass the correct docs in here
def download(self, data_dir=None, cache_dir=None, download_mode=None):
"""Downloads and returns the task dataset.
......@@ -193,6 +197,19 @@ class Task(abc.ABC):
"""
return []
def fewshot_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
if self.has_training_docs():
return self.training_docs()
elif self.has_validation_docs():
return self.validation_docs()
else:
# TODO: should we allow this case to occur? / should raise a warning here
return self.test_docs()
def _process_doc(self, doc):
"""
Override this to process (detokenize, strip, replace, etc.) individual
......@@ -313,6 +330,16 @@ class Task(abc.ABC):
"""
pass
@classmethod
def count_bytes(cls, doc):
"""Used for byte-level perplexity metrics in rolling loglikelihood"""
return len(doc.encode("utf-8"))
@classmethod
def count_words(cls, doc):
"""Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
return len(re.split(r"\s+", doc))
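A quick illustration of why both counts are tracked: byte counts follow UTF-8 encoding while word counts follow whitespace splitting, so they diverge on non-ASCII text. The sample strings are arbitrary:

```python
import re

def count_bytes(doc):
    return len(doc.encode("utf-8"))

def count_words(doc):
    return len(re.split(r"\s+", doc))

print(count_words("the quick brown fox"), count_bytes("the quick brown fox"))  # 4 19
print(count_words("naïve café"), count_bytes("naïve café"))                    # 2 12
```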
@utils.positional_deprecated
def fewshot_context(self, doc, num_fewshot, rnd=None):
"""Returns a fewshot context string that is made up of a prepended description
......@@ -336,33 +363,33 @@ class Task(abc.ABC):
labeled_examples = ""
else:
# labeled_examples = self.sampler.get_context(doc, self._config.num_fewshot)
labeled_examples = self.sampler.get_context(doc, self._config.num_fewshot)
# for sets with no training docs, draw from other set *but ensure no overlap with current doc*
if self.has_training_docs():
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(
self.validation_docs()
if self.has_validation_docs()
else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = (
"\n\n".join(
[
self.doc_to_text(doc) + self.doc_to_target(doc)
for doc in fewshotex
]
)
+ "\n\n"
)
# if self.has_training_docs():
# fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
# else:
# if self._fewshot_docs is None:
# self._fewshot_docs = list(
# self.validation_docs()
# if self.has_validation_docs()
# else self.test_docs()
# )
# fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# # get rid of the doc that's the one we're evaluating, if it's in the fewshot
# fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
# labeled_examples = (
# "\n\n".join(
# [
# self.doc_to_text(doc) + self.doc_to_target(doc)
# for doc in fewshotex
# ]
# )
# + "\n\n"
# )
example = self.doc_to_text(doc)
return labeled_examples + example
......@@ -376,7 +403,7 @@ class Task(abc.ABC):
class ConfigurableTask(Task):
VERSION = "2.0"
OUTPUT_TYPE = "greedy_until"
OUTPUT_TYPE = None
def __init__(
self, data_dir=None, cache_dir=None, download_mode=None, config: dict = None
......@@ -432,6 +459,8 @@ class ConfigurableTask(Task):
for name, components in self._config.get("filters", [["none", ["take_first"]]]):
filter_pipeline = build_filter_ensemble(name, components)
self._filters.append(filter_pipeline)
self.sampler = samplers.Sampler(list(self.fewshot_docs()), self, rnd=random.Random()) # TODO: pass the correct docs in here
def has_training_docs(self):
if self._config.training_split is not None:
......@@ -463,6 +492,13 @@ class ConfigurableTask(Task):
if self._config.test_split is not None:
return self.dataset[self._config.test_split]
def fewshot_docs(self):
if self._config.fewshot_split:
return self.dataset[self._config.fewshot_split]
else:
# TODO: warn user if fewshot split isn't explicitly set
return super().fewshot_docs()
def should_decontaminate(self):
return self._config.should_decontaminate
......@@ -497,6 +533,19 @@ class ConfigurableTask(Task):
arguments=(ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments=(self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
import ast
return [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " {}".format(choice)),
id_=i,
**kwargs,
)
for i, choice in enumerate(ast.literal_eval(utils.apply_template(self._config.template_aliases + "{{answer_choices}}", doc)))
# we pass the user-defined answer_choices var (in aliases) and echo the result. TODO: any cleaner way to do this?
]
elif self.OUTPUT_TYPE == "greedy_until":
arguments=(ctx, self._config.delimiter)
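To make the multiple_choice branch above concrete, here is a sketch of how template_aliases plus "{{answer_choices}}" can be rendered and parsed back into a Python list, assuming utils.apply_template is essentially a Jinja2 render with the doc as context; the doc below is a made-up ARC-style record:

```python
import ast
from jinja2 import Template

doc = {  # hypothetical ARC-style document
    "question": "Which gas do plants absorb from the atmosphere?",
    "choices": {"text": ["oxygen", "carbon dioxide", "nitrogen", "helium"],
                "label": ["A", "B", "C", "D"]},
    "answerKey": "B",
}

template_aliases = "{% set answer_choices = choices['text'] %}"
rendered = Template(template_aliases + "{{answer_choices}}").render(**doc)
print(rendered)                       # "['oxygen', 'carbon dioxide', 'nitrogen', 'helium']"
choices = ast.literal_eval(rendered)  # echoed list parsed back into a real Python object
print(choices[1])                     # "carbon dioxide"
```

One loglikelihood Instance is then built per entry of that list, each scoring " {choice}" as a continuation of the few-shot context.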
......@@ -504,6 +553,7 @@ class ConfigurableTask(Task):
request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=arguments,
id_=0,
**kwargs
)
......@@ -515,7 +565,30 @@ class ConfigurableTask(Task):
ll, is_greedy = results
result_dict = {"perplexity": ll, "accuracy": int(is_greedy)}
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
pass
(loglikelihood,) = results
words = self.count_words(self.doc_to_target(doc))
bytes_ = self.count_bytes(self.doc_to_target(doc))
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (loglikelihood, bytes_),
}
elif self.OUTPUT_TYPE == "multiple_choice":
lls = [res[0] for res in results]  # only retain loglikelihoods; discard is_greedy. TODO: keep is_greedy so exact_match can also be reported for multiple-choice tasks
gold = int(self.doc_to_target(doc))
# TODO: remove dependence on "gold" and "choices" columns
acc = 1.0 if np.argmax(lls) == gold else 0.0
completion_len = np.array([float(len(i)) for i in doc["choices"]])
acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
# TODO: set which normalization metrics should be reported, and calculate them
# TODO: add mutual info.
result_dict = {
"acc": acc,
"acc_norm": acc_norm,
}
elif self.OUTPUT_TYPE == "greedy_until":
if self._config.gold_alias is not None:
......@@ -531,6 +604,10 @@ class ConfigurableTask(Task):
)
result_dict[key] = _dict[key]
else:
raise ValueError(f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
"'loglikelihood', 'loglikelihood_rolling', 'greedy_until'"
)
return result_dict
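A worked illustration of the acc / acc_norm computation above, with made-up loglikelihoods: acc takes the raw argmax, while acc_norm divides each loglikelihood by the character length of its choice to offset the bias toward shorter completions:

```python
import numpy as np

# hypothetical per-choice loglikelihoods and answer choices
lls = np.array([-9.6, -9.8, -15.1, -10.2])
choices = ["oxygen", "carbon dioxide", "nitrogen", "helium"]
gold = 1  # "carbon dioxide"

acc = 1.0 if np.argmax(lls) == gold else 0.0
completion_len = np.array([float(len(c)) for c in choices])
acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0

# the short wrong choice wins on raw loglikelihood, but not after length normalization
print(acc, acc_norm)  # 0.0 1.0
```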
......@@ -558,11 +635,6 @@ class MultipleChoiceTask(Task):
**kwargs,
)
for i, choice in enumerate(doc["choices"])]
#lls = [
# rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
# ]
# return lls
def process_results(self, doc, results):
results = [res[0] for res in results] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere?
......@@ -638,8 +710,8 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
(loglikelihood,) = results
words = self.count_words(doc)
bytes_ = self.count_bytes(doc)
words = self.count_words(self.doc_to_target(doc))
bytes_ = self.count_bytes(self.doc_to_target(doc))
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes_),
......@@ -668,19 +740,22 @@ class PerplexityTask(Task, abc.ABC):
TASK_REGISTRY = {}
ALL_TASKS = []
def register_task(name):
def register_task(*names):
# either pass a list or a single alias.
# function receives them as a tuple of strings
def decorate(cls):
assert (
issubclass(cls, Task)
), f"Task '{name}' ({cls.__name__}) must extend Task class"
for name in names:
assert (
issubclass(cls, Task)
), f"Task '{name}' ({cls.__name__}) must extend Task class"
assert (
name not in TASK_REGISTRY
), f"Task named '{name}' conflicts with existing task!"
assert (
name not in TASK_REGISTRY
), f"Task named '{name}' conflicts with existing task! Please register with a non-conflicting alias instead."
TASK_REGISTRY[name] = cls
ALL_TASKS = sorted(list(TASK_REGISTRY)) # TODO: this doesn't seem to import right.
TASK_REGISTRY[name] = cls
ALL_TASKS = sorted(list(TASK_REGISTRY)) # TODO: this doesn't seem to import right.
return cls
return decorate
......
......@@ -145,7 +145,8 @@ def evaluate(
# for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
task.build_all_requests(limit=limit)
# aggregate Instances by LM method requested to get output.
requests[task.OUTPUT_TYPE].extend(task.instances)
reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE #TODO: this is hacky, fix in task.py
requests[reqtype].extend(task.instances)
### Run LM on inputs, get all outputs ###
# execute each type of request
......
......@@ -9,7 +9,7 @@ from lm_eval import utils
from lm_eval.api.model import LM, register_model
@register_model("hf-causal")
@register_model("hf-causal", "gpt2")
class HFLM(LM):
def __init__(
self,
......
......@@ -41,7 +41,7 @@ from . import lambada
# from . import hendrycks_math
# from . import cbt
# from . import lambada_cloze
# from . import pile
from . import pile
from . import wikitext
# from . import lambada_multilingual
# from . import mutual
......
dataset_path: ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Q: {{question}}\nA:"
doc_to_target: "{% set answer_choices = doc['choices']['text'] %}{{answer_choices[int(doc['answerKey']) - 1]}}"
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{gold}}" # this will be cast to an int.
metric_list:
- metric: exact_match
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
\ No newline at end of file
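For the gold target above: template_aliases sets gold to the index of answerKey within the label list, and doc_to_target renders that index (later cast to int in process_results). A sketch using Jinja2 directly on a made-up document:

```python
from jinja2 import Template

doc = {  # hypothetical ARC-style document
    "question": "Which gas do plants absorb from the atmosphere?",
    "choices": {"text": ["oxygen", "carbon dioxide", "nitrogen", "helium"],
                "label": ["A", "B", "C", "D"]},
    "answerKey": "B",
}

template_aliases = ("{% set answer_choices = choices['text'] %}"
                    "{% set gold = choices.label.index(answerKey) %}")
doc_to_target = "{{gold}}"

print(Template(template_aliases + doc_to_target).render(**doc))  # "1", cast to int(1) downstream
```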
dataset_path: EleutherAI/lambada_openai
dataset_name: default
output_type: "loglikelihood"
output_type: loglikelihood
test_split: test
template_aliases: "{% set hypo = hypothesis %}"
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......@@ -12,5 +12,5 @@ metric_list:
aggregation: perplexity
higher_is_better: true
- metric: accuracy
aggregation: perplexity
aggregation: mean
higher_is_better: true
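To make these templates concrete: doc_to_text drops the final whitespace-delimited token and doc_to_target is that token with a leading space, which is what the loglikelihood request scores. A quick check with Jinja2 on an arbitrary sentence:

```python
from jinja2 import Template

doc = {"text": "The last word of this sentence is predictable"}  # made-up example

doc_to_text = "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target = "{{' '+text.split(' ')[-1]}}"

print(repr(Template(doc_to_text).render(**doc)))    # 'The last word of this sentence is'
print(repr(Template(doc_to_target).render(**doc)))  # ' predictable'
```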
"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf
The Pile is an 825 GiB diverse, open source language modelling dataset that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
from lm_eval.api.task import PerplexityTask, register_task
_CITATION = """
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
"""
class PilePerplexityTask(PerplexityTask):
VERSION = "2.0"
DATASET_PATH = "EleutherAI/the_pile"
DATASET_NAME = None
def has_training_docs(self):
return False
def test_docs(self):
for doc in self.dataset["train"].select(range(100)):
yield doc
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def doc_to_target(self, doc):
return doc["text"]
# def validation_docs(self):
# for doc in self.dataset["validation"]:
# yield doc["text"]
# def test_docs(self):
# for doc in self.dataset["test"]:
# yield doc["text"]
class PileArxiv(PilePerplexityTask):
DATASET_NAME = "pile_arxiv"
class PileBooks3(PilePerplexityTask):
DATASET_NAME = "pile_books3"
class PileBookCorpus2(PilePerplexityTask):
DATASET_NAME = "pile_bookcorpus2"
class PileDmMathematics(PilePerplexityTask):
DATASET_NAME = "pile_dm-mathematics"
@register_task("pile_enron")
class PileEnron(PilePerplexityTask):
DATASET_NAME = "enron_emails"
class PileEuroparl(PilePerplexityTask):
DATASET_NAME = "pile_europarl"
class PileFreeLaw(PilePerplexityTask):
DATASET_NAME = "pile_freelaw"
class PileGithub(PilePerplexityTask):
DATASET_NAME = "pile_github"
class PileGutenberg(PilePerplexityTask):
DATASET_NAME = "pile_gutenberg"
class PileHackernews(PilePerplexityTask):
DATASET_NAME = "pile_hackernews"
class PileNIHExporter(PilePerplexityTask):
DATASET_NAME = "pile_nih-exporter"
class PileOpenSubtitles(PilePerplexityTask):
DATASET_NAME = "pile_opensubtitles"
class PileOpenWebText2(PilePerplexityTask):
DATASET_NAME = "pile_openwebtext2"
class PilePhilPapers(PilePerplexityTask):
DATASET_NAME = "pile_philpapers"
class PilePileCc(PilePerplexityTask):
DATASET_NAME = "pile_pile-cc"
class PilePubmedAbstracts(PilePerplexityTask):
DATASET_NAME = "pile_pubmed-abstracts"
class PilePubmedCentral(PilePerplexityTask):
DATASET_NAME = "pile_pubmed-central"
class PileStackExchange(PilePerplexityTask):
DATASET_NAME = "pile_stackexchange"
class PileUspto(PilePerplexityTask):
DATASET_NAME = "pile_upsto"
class PileUbuntuIrc(PilePerplexityTask):
DATASET_NAME = "pile_ubuntu-irc"
class PileWikipedia(PilePerplexityTask):
DATASET_NAME = "pile_wikipedia"
class PileYoutubeSubtitles(PilePerplexityTask):
DATASET_NAME = "pile_youtubesubtitles"
\ No newline at end of file
dataset_path: EleutherAI/the_pile
dataset_name: enron_emails
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file