"driver/device_direct_convolution_1.hpp" did not exist on "20423a3583ab2d6b129a4b3d3174f6023850cec9"
Unverified Commit 2da74953 authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #438 from EleutherAI/configurable-tasks

Configurable-Tasks
parents fa686d04 d2b16757
......@@ -5,10 +5,9 @@ validation_split: validation
template_aliases: "{% set hypo = hypothesis %}"
doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypo}}\"? Yes, no, or maybe?"
doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
metric_list: [
[exact_match, mean, true]
]
# filters: [
# ["none", ["take_first"]]
# ]
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
\ No newline at end of file
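Each dict-style `metric_list` entry above is split downstream into a metric name, an aggregation, a higher-is-better flag, and extra kwargs for the metric's `compute()` call. A minimal standalone sketch mirroring the parsing loop in `ConfigurableTask.__init__` later in this diff (the literal below is just an illustration):

```python
# Sketch: how one dict-style metric_list entry is decomposed.
metric_config = {
    "metric": "exact_match",
    "aggregation": "mean",
    "higher_is_better": True,
    "ignore_case": True,
    "ignore_punctuation": True,
}

metric_name = metric_config["metric"]
aggregation = metric_config["aggregation"]
higher_is_better = metric_config["higher_is_better"]
# everything else is forwarded to the metric's compute() call as kwargs
kwargs = {
    k: v
    for k, v in metric_config.items()
    if k not in ("metric", "aggregation", "higher_is_better")
}
print(metric_name, aggregation, higher_is_better, kwargs)
# exact_match mean True {'ignore_case': True, 'ignore_punctuation': True}
```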
from . import metrics
METRIC_REGISTRY = {
"matthews_corrcoef": metrics.matthews_corrcoef,
"f1_score": metrics.f1_score,
"perplexity": metrics.perplexity,
"bleu": metrics.bleu,
"chrf": metrics.chrf,
"ter": metrics.ter,
}
AGGREGATION_REGISTRY = {
"mean": metrics.mean,
"median": metrics.median,
"perplexity": metrics.perplexity,
}
HIGHER_IS_BETTER_REGISTRY = {
"matthews_corrcoef": True,
"f1_score": True,
"perplexity": False,
"bleu": True,
"chrf": True,
"ter": False,
"acc": True,
"acc_norm": True,
"word_perplexity": False,
"byte_perplexity": False,
"bits_per_byte": False,
}
\ No newline at end of file
......@@ -11,6 +11,7 @@ class Instance:
resps: list = field(default_factory=list)
filtered_resps: dict = field(default_factory=dict)
# initialized after init
task_name: str = None
doc_id: str = None
repeats: str = None
......
......@@ -10,7 +10,12 @@ import evaluate
AGGREGATION_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_REGISTRY = {
"acc": None,
"acc_norm": None,
"word_perplexity": None,
"byte_perplexity": None,
}
def register_metric(name):
......@@ -45,6 +50,7 @@ searching in HF Evaluate library...")
def register_aggregation(name):
# TODO: should we enforce a specific interface to aggregation metrics?
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
......@@ -155,6 +161,7 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
@register_metric("perplexity")
@register_aggregation("perplexity")
def perplexity(items):
return math.exp(-mean(items))
......@@ -165,10 +172,13 @@ def weighted_mean(items):
@register_metric("weighted_perplexity")
@register_aggregation("weighted_perplexity")
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
@register_metric("bits_per_byte")
@register_aggregation("bits_per_byte")
def bits_per_byte(items):
return -weighted_mean(items) / math.log(2)
......
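As a worked example of how `weighted_perplexity` and `bits_per_byte` combine the per-document `(loglikelihood, count)` pairs emitted by `process_results` for rolling loglikelihoods, here is a minimal standalone sketch. It assumes `weighted_mean` is total loglikelihood over total count, matching how it is used above:

```python
import math

# Assumed definition of weighted_mean for this sketch: summed loglikelihood
# divided by summed word/byte count across documents.
def weighted_mean(items):
    numerators, denominators = zip(*items)
    return sum(numerators) / sum(denominators)

# Two hypothetical documents: (summed loglikelihood, byte count)
items = [(-1200.0, 800), (-300.0, 250)]

byte_ppl = math.exp(-weighted_mean(items))   # exp(1500 / 1050) ~= 4.17
bpb = -weighted_mean(items) / math.log(2)    # ~= 2.06 bits per byte
print(byte_ppl, bpb)
```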
import abc
from typing import Union
from lm_eval import utils
MODEL_REGISTRY = {}
def register_model(name):
# TODO: should fairseq/elk be cited for this design pattern?
def register_model(*names):
# either pass a list or a single alias.
# function receives them as a tuple of strings
def decorate(cls):
assert (
issubclass(cls, LM)
), f"Model '{name}' ({cls.__name__}) must extend LM class"
for name in names:
assert (
issubclass(cls, LM)
), f"Model '{name}' ({cls.__name__}) must extend LM class"
assert (
name not in MODEL_REGISTRY
), f"Model named '{name}' conflicts with existing model!"
assert (
name not in MODEL_REGISTRY
), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
MODEL_REGISTRY[name] = cls
MODEL_REGISTRY[name] = cls
return cls
return decorate
......
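A self-contained sketch of the multi-alias registration pattern introduced here; the class name below is a placeholder, not part of this PR, and the aliases mirror the `@register_model("hf-causal", "gpt2")` registration later in this diff:

```python
MODEL_REGISTRY = {}

class LM:  # stand-in for lm_eval.api.model.LM
    pass

def register_model(*names):
    # Accepts one or more aliases; every alias maps to the same class.
    def decorate(cls):
        for name in names:
            assert issubclass(cls, LM), f"Model '{name}' ({cls.__name__}) must extend LM class"
            assert name not in MODEL_REGISTRY, f"Model named '{name}' conflicts with existing model!"
            MODEL_REGISTRY[name] = cls
        return cls
    return decorate

@register_model("hf-causal", "gpt2")
class DummyHFLM(LM):  # placeholder class for illustration
    pass

assert MODEL_REGISTRY["hf-causal"] is MODEL_REGISTRY["gpt2"] is DummyHFLM
```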
......@@ -5,15 +5,19 @@ import re
import evaluate
import random
import itertools
import functools
import datasets
import numpy as np
from typing import List, Union
from lm_eval.api.metrics import METRIC_REGISTRY, AGGREGATION_REGISTRY
from lm_eval.api import HIGHER_IS_BETTER_REGISTRY
from lm_eval.api.instance import Instance
from lm_eval.api.metrics import get_metric, get_aggregation, mean, weighted_perplexity, bits_per_byte
from lm_eval import utils
from lm_eval.prompts import get_prompt
from lm_eval.filters import build_filter_ensemble
from lm_eval.api import samplers
......@@ -34,17 +38,20 @@ class TaskConfig(dict):
doc_to_text: str = ""
doc_to_target: str = ""
# aggregation: dict = None # TODO: remove, I think these 2 are obsolete w/ current metric_list impl.
# higher_is_better: dict = None
num_fewshot: int = 0
batch_size: int = 1
repeats: int = 1
metric_list: str = None
gold_alias: str = None
output_type: str = "greedy_until"
delimiter: str = "\n\n"
filters: str = None #TODO: need to make this typehint `list`?
normalization: str = None # TODO: add length-normalization of various types, mutual info
stop_sequences: list = None # TODO: allow passing of stop sequences to greedy gen.
should_decontaminate: bool = False
doc_to_decontamination_query: str = None
use_prompt: str = None
def __post_init__(self):
# allow user-specified aliases so that users can
......@@ -118,7 +125,8 @@ class Task(abc.ABC):
filter_pipeline = build_filter_ensemble(name, components)
self._filters.append(filter_pipeline)
self.sampler = samplers.Sampler(self.training_docs(), self, rnd=random.Random()) # TODO: pass the correct docs in here
self.sampler = samplers.Sampler(self.fewshot_docs(), self, rnd=random.Random()) # TODO: pass the correct docs in here
def download(self, data_dir=None, cache_dir=None, download_mode=None):
"""Downloads and returns the task dataset.
......@@ -189,6 +197,19 @@ class Task(abc.ABC):
"""
return []
def fewshot_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
if self.has_training_docs():
return self.training_docs()
elif self.has_validation_docs():
return self.validation_docs()
else:
# TODO: should we allow this case to occur? / should raise a warning here
return self.test_docs()
def _process_doc(self, doc):
"""
Override this to process (detokenize, strip, replace, etc.) individual
......@@ -309,6 +330,16 @@ class Task(abc.ABC):
"""
pass
@classmethod
def count_bytes(cls, doc):
"""Used for byte-level perplexity metrics in rolling loglikelihood"""
return len(doc.encode("utf-8"))
@classmethod
def count_words(cls, doc):
"""Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
return len(re.split(r"\s+", doc))
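A quick standalone illustration of what these two defaults count (plain Python, no harness imports):

```python
import re

doc = "The quick brown fox."
print(len(re.split(r"\s+", doc)))      # 4 whitespace-delimited "words"
print(len(doc.encode("utf-8")))        # 20 bytes (ASCII: one byte per character)

doc_utf8 = "héllo wörld"
print(len(doc_utf8.encode("utf-8")))   # 13 bytes: the accented characters take 2 bytes each
```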
@utils.positional_deprecated
def fewshot_context(self, doc, num_fewshot, rnd=None):
"""Returns a fewshot context string that is made up of a prepended description
......@@ -332,33 +363,33 @@ class Task(abc.ABC):
labeled_examples = ""
else:
# labeled_examples = self.sampler.get_context(doc, self._config.num_fewshot)
labeled_examples = self.sampler.get_context(doc, self._config.num_fewshot)
# for sets with no training docs, draw from other set *but ensure no overlap with current doc*
if self.has_training_docs():
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(
self.validation_docs()
if self.has_validation_docs()
else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = (
"\n\n".join(
[
self.doc_to_text(doc) + self.doc_to_target(doc)
for doc in fewshotex
]
)
+ "\n\n"
)
# if self.has_training_docs():
# fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
# else:
# if self._fewshot_docs is None:
# self._fewshot_docs = list(
# self.validation_docs()
# if self.has_validation_docs()
# else self.test_docs()
# )
# fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# # get rid of the doc that's the one we're evaluating, if it's in the fewshot
# fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
# labeled_examples = (
# "\n\n".join(
# [
# self.doc_to_text(doc) + self.doc_to_target(doc)
# for doc in fewshotex
# ]
# )
# + "\n\n"
# )
example = self.doc_to_text(doc)
return labeled_examples + example
......@@ -372,13 +403,17 @@ class Task(abc.ABC):
class ConfigurableTask(Task):
VERSION = "2.0"
OUTPUT_TYPE = "greedy_until"
OUTPUT_TYPE = None
def __init__(
self, data_dir=None, cache_dir=None, download_mode=None, config: dict = None
):
self._config = TaskConfig(**config)
if self._config.output_type is not None:
self.OUTPUT_TYPE = self._config.output_type
if self._config.dataset_path is not None:
self.DATASET_PATH = self._config.dataset_path
......@@ -387,27 +422,33 @@ class ConfigurableTask(Task):
if self._config.metric_list is not None:
self._metric_list = {}
self._metric_kwargs = {}
self._aggregation_list = {}
self._higher_is_better = {}
for (metric_name, aggregation, higher_is_better) in self._config.metric_list:
self._aggregation_list[metric_name] = get_aggregation(aggregation)
self._higher_is_better[metric_name] = higher_is_better
self._metric_list[metric_name] = get_metric(metric_name)
# if metric_name in METRIC_REGISTRY.keys():
# self._metric_list[metric_name] = METRIC_REGISTRY[metric_name]
# else:
# try:
# metric_object = evaluate.load(metric_name)
# self._metric_list[metric_name] = metric_object
# except Exception as ex:
# raise Warning(
# "{} not found in the evaluate library!".format(metric_name),
# "Please check https://huggingface.co/evaluate-metric",
# )
for metric_config in self._config.metric_list:
metric_name = metric_config['metric']
aggregation = metric_config['aggregation']
higher_is_better = metric_config['higher_is_better']
kwargs = {key: metric_config[key] for key in metric_config if key not in ['metric', 'aggregation', 'higher_is_better']}
self._aggregation_list[metric_name] = AGGREGATION_REGISTRY[aggregation]
if metric_name in METRIC_REGISTRY.keys():
self._metric_list[metric_name] = METRIC_REGISTRY[metric_name]
self._higher_is_better[metric_name] = HIGHER_IS_BETTER_REGISTRY[metric_name]
else:
self._higher_is_better[metric_name] = higher_is_better
try:
metric_object = evaluate.load(metric_name)
self._metric_list[metric_name] = metric_object
self._metric_kwargs[metric_name] = kwargs
except Exception as ex:
raise Warning(
"{} not found in the evaluate library!".format(metric_name),
"Please check https://huggingface.co/evaluate-metric",
)
self.download(data_dir, cache_dir, download_mode)
self._training_docs = None
......@@ -418,6 +459,8 @@ class ConfigurableTask(Task):
for name, components in self._config.get("filters", [["none", ["take_first"]]]):
filter_pipeline = build_filter_ensemble(name, components)
self._filters.append(filter_pipeline)
self.sampler = samplers.Sampler(list(self.fewshot_docs()), self, rnd=random.Random()) # TODO: pass the correct docs in here
def has_training_docs(self):
if self._config.training_split is not None:
......@@ -449,6 +492,20 @@ class ConfigurableTask(Task):
if self._config.test_split is not None:
return self.dataset[self._config.test_split]
def fewshot_docs(self):
if self._config.fewshot_split:
return self.dataset[self._config.fewshot_split]
else:
# TODO: warn user if fewshot split isn't explicitly set
return super().fewshot_docs()
def should_decontaminate(self):
return self._config.should_decontaminate
def doc_to_decontamination_query(self, doc):
if self._config.should_decontaminate:
return utils.apply_template(self._config.doc_to_decontamination_query, doc)
def _process_doc(self, doc):
"""
Override this to process (detokenize, strip, replace, etc.) individual
......@@ -461,40 +518,103 @@ class ConfigurableTask(Task):
return doc
def doc_to_text(self, doc):
return utils.apply_template(self._config.doc_to_text, doc)
if self._config.use_prompt is not None:
doc_to_text = get_prompt(self._config.use_prompt)
else:
doc_to_text = self._config.doc_to_text
return utils.apply_template(doc_to_text, doc)
def doc_to_target(self, doc):
return utils.apply_template(self._config.doc_to_target, doc)
def construct_requests(self, doc, ctx, **kwargs):
if self.OUTPUT_TYPE == "greedy_until":
return Instance(request_type=self.OUTPUT_TYPE, doc=doc, arguments=(ctx, "\n\n"), idx=0, **kwargs)
if self.OUTPUT_TYPE == "loglikelihood":
arguments=(ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments=(self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
import ast
return [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " {}".format(choice)),
idx=i,
**kwargs,
)
for i, choice in enumerate(ast.literal_eval(utils.apply_template(self._config.template_aliases + "{{answer_choices}}", doc)))
# we pass the user-defined answer_choices var (in aliases) and echo the result. TODO: any cleaner way to do this?
]
elif self.OUTPUT_TYPE == "greedy_until":
arguments=(ctx, self._config.delimiter)
return Instance(
request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=arguments,
idx=0,
**kwargs
)
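For a multiple_choice doc, the branch above fans out into one loglikelihood Instance per answer choice, each scoring " {choice}" as the continuation of the context. A rough sketch of the resulting (context, continuation) argument pairs; the doc fields below are illustrative only:

```python
# Hypothetical doc and few-shot context, for illustration only.
doc = {"question": "2 + 2 = ?", "choices": ["3", "4", "5"], "gold": 1}
ctx = "Question: 2 + 2 = ?\nAnswer:"

# One loglikelihood request per choice, each scoring " <choice>" as the continuation.
argument_pairs = [(ctx, " {}".format(choice)) for choice in doc["choices"]]
# [("Question: 2 + 2 = ?\nAnswer:", " 3"),
#  ("Question: 2 + 2 = ?\nAnswer:", " 4"),
#  ("Question: 2 + 2 = ?\nAnswer:", " 5")]
```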
def process_results(self, doc, results):
if self._config.gold_alias is not None:
gold = doc[self._config.gold_alias]
else:
gold = self.doc_to_target(doc)
result_dict = {}
for key, result in zip(self._metric_list.keys(), results):
_dict = self._metric_list[key](
references=[gold],
predictions=[result],
)
if self.OUTPUT_TYPE == "loglikelihood":
results = results[0]
ll, is_greedy = results
result_dict = {"perplexity": ll, "accuracy": int(is_greedy)}
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
(loglikelihood,) = results
words = self.count_words(self.doc_to_target(doc))
bytes_ = self.count_bytes(self.doc_to_target(doc))
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (loglikelihood, bytes_),
}
elif self.OUTPUT_TYPE == "multiple_choice":
lls = [res[0] for res in results] # only retain loglikelihoods, discard is_greedy TODO: keep is_greedy to report exact_match as well on multiple choice probs
gold = int(self.doc_to_target(doc))
# TODO: remove dependence on "gold" and "choices" columns
acc = 1.0 if np.argmax(lls) == gold else 0.0
completion_len = np.array([float(len(i)) for i in doc["choices"]])
acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
# TODO: set which normalization metrics should be reported, and calculate them
# TODO: add mutual info.
result_dict = {
"acc": acc,
"acc_norm": acc_norm,
}
elif self.OUTPUT_TYPE == "greedy_until":
if self._config.gold_alias is not None:
gold = doc[self._config.gold_alias]
else:
gold = self.doc_to_target(doc)
for key, result in zip(self._metric_list.keys(), results):
_dict = self._metric_list[key].compute(
references=[gold],
predictions=[result],
**self._metric_kwargs[key]
)
result_dict[key] = _dict[key]
result_dict[key] = _dict[key]
else:
raise ValueError(f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
"'loglikelihood', 'loglikelihood_rolling', 'greedy_until'"
)
return result_dict
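To make the multiple_choice scoring above concrete, a toy example of the acc / length-normalized acc_norm computation (all values invented):

```python
import numpy as np

# Per-choice loglikelihoods returned by the LM, and the raw choice strings.
lls = np.array([-3.8, -3.9, -6.0])
choices = ["London", "Paris is the capital", "Rome"]
gold = 1

completion_len = np.array([float(len(c)) for c in choices])

acc = 1.0 if np.argmax(lls) == gold else 0.0                         # raw argmax picks index 0 -> 0.0
acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0   # length normalization picks index 1 -> 1.0
print(acc, acc_norm)  # 0.0 1.0
```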
def aggregation(self):
return self._aggregation_list
def higher_is_better(self):
return self._higher_is_better
......@@ -515,11 +635,6 @@ class MultipleChoiceTask(Task):
**kwargs,
)
for i, choice in enumerate(doc["choices"])]
#lls = [
# rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
# ]
# return lls
def process_results(self, doc, results):
results = [res[0] for res in results] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere?
......@@ -595,8 +710,8 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
(loglikelihood,) = results
words = self.count_words(doc)
bytes_ = self.count_bytes(doc)
words = self.count_words(self.doc_to_target(doc))
bytes_ = self.count_bytes(self.doc_to_target(doc))
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes_),
......@@ -625,19 +740,22 @@ class PerplexityTask(Task, abc.ABC):
TASK_REGISTRY = {}
ALL_TASKS = []
def register_task(name):
def register_task(*names):
# either pass a list or a single alias.
# function receives them as a tuple of strings
def decorate(cls):
assert (
issubclass(cls, Task)
), f"Task '{name}' ({cls.__name__}) must extend Task class"
for name in names:
assert (
issubclass(cls, Task)
), f"Task '{name}' ({cls.__name__}) must extend Task class"
assert (
name not in TASK_REGISTRY
), f"Task named '{name}' conflicts with existing task!"
assert (
name not in TASK_REGISTRY
), f"Task named '{name}' conflicts with existing task! Please register with a non-conflicting alias instead."
TASK_REGISTRY[name] = cls
ALL_TASKS = sorted(list(TASK_REGISTRY)) # TODO: this doesn't seem to import right.
TASK_REGISTRY[name] = cls
ALL_TASKS = sorted(list(TASK_REGISTRY)) # TODO: this doesn't seem to import right.
return cls
return decorate
......
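Analogous to register_model above, a task class can now be registered under several aliases. A hypothetical usage sketch (the demo aliases and class are illustrative, not part of this PR; the import matches pile.py later in this diff):

```python
from lm_eval.api.task import PerplexityTask, register_task

# Hypothetical: register one task class under two aliases.
@register_task("pile_enron_demo", "enron_emails_demo")
class PileEnronDemo(PerplexityTask):
    VERSION = "2.0"
    DATASET_PATH = "EleutherAI/the_pile"
    DATASET_NAME = "enron_emails"
```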
......@@ -145,7 +145,8 @@ def evaluate(
# for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
task.build_all_requests(limit=limit)
# aggregate Instances by LM method requested to get output.
requests[task.OUTPUT_TYPE].extend(task.instances)
reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE #TODO: this is hacky, fix in task.py
requests[reqtype].extend(task.instances)
### Run LM on inputs, get all outputs ###
# execute each type of request
......
......@@ -9,7 +9,7 @@ from lm_eval import utils
from lm_eval.api.model import LM, register_model
@register_model("hf-causal")
@register_model("hf-causal", "gpt2")
class HFLM(LM):
def __init__(
self,
......
......@@ -41,7 +41,7 @@ from . import lambada
# from . import hendrycks_math
# from . import cbt
# from . import lambada_cloze
# from . import pile
from . import pile
from . import wikitext
# from . import lambada_multilingual
# from . import mutual
......
dataset_path: ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Q: {{question}}\nA:"
doc_to_target: "{% set answer_choices = doc['choices']['text'] %}{{answer_choices[int(doc['answerKey']) - 1]}}"
metric_list: [
[exact_match, mean, true]
]
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{gold}}" # this will be cast to an int.
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
\ No newline at end of file
......@@ -2,12 +2,15 @@ dataset_path: gsm8k
dataset_name: main
training_split: train
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}" # TODO: this field needs to change to account for the regexing that happens etc.
metric_list: [
[acc, mean, true]
]
filters: [
["regex", ["regex", "take_first"]]
]
stop_sequences: ["\n"]
\ No newline at end of file
doc_to_target: "{{answer.split('### ')[-1]}}"
use_prompt: "qa-basic:question-newline-answer"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
delimiter: "\n"
# filters: [
# ["regex", ["regex", "take_first"]]
# ]
\ No newline at end of file
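The extra keys on the exact_match entry (ignore_case, ignore_punctuation) are forwarded to the metric's compute() call via the _metric_kwargs handling in task.py above. Assuming the Hugging Face `evaluate` exact_match metric and its ignore_case / ignore_punctuation options, the resulting call is roughly:

```python
import evaluate

# Flags come from the YAML above; exact_match and its options are assumed
# to be available from the HF evaluate library.
exact_match = evaluate.load("exact_match")
score = exact_match.compute(
    predictions=["72."],
    references=["72"],
    ignore_case=True,
    ignore_punctuation=True,
)
print(score)  # -> {'exact_match': 1.0}
```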
dataset_path: EleutherAI/lambada_openai
dataset_name: default
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: perplexity
aggregation: perplexity
higher_is_better: true
- metric: accuracy
aggregation: mean
higher_is_better: true
"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf
The Pile is a 825 GiB diverse, open source language modelling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
from lm_eval.api.task import PerplexityTask, register_task
_CITATION = """
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
"""
class PilePerplexityTask(PerplexityTask):
VERSION = "2.0"
DATASET_PATH = "EleutherAI/the_pile"
DATASET_NAME = None
def has_training_docs(self):
return False
def test_docs(self):
for doc in self.dataset["train"].select(range(100)):
yield doc
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def doc_to_target(self, doc):
return doc["text"]
# def validation_docs(self):
# for doc in self.dataset["validation"]:
# yield doc["text"]
# def test_docs(self):
# for doc in self.dataset["test"]:
# yield doc["text"]
class PileArxiv(PilePerplexityTask):
DATASET_NAME = "pile_arxiv"
class PileBooks3(PilePerplexityTask):
DATASET_NAME = "pile_books3"
class PileBookCorpus2(PilePerplexityTask):
DATASET_NAME = "pile_bookcorpus2"
class PileDmMathematics(PilePerplexityTask):
DATASET_NAME = "pile_dm-mathematics"
@register_task("pile_enron")
class PileEnron(PilePerplexityTask):
DATASET_NAME = "enron_emails"
class PileEuroparl(PilePerplexityTask):
DATASET_NAME = "pile_europarl"
class PileFreeLaw(PilePerplexityTask):
DATASET_NAME = "pile_freelaw"
class PileGithub(PilePerplexityTask):
DATASET_NAME = "pile_github"
class PileGutenberg(PilePerplexityTask):
DATASET_NAME = "pile_gutenberg"
class PileHackernews(PilePerplexityTask):
DATASET_NAME = "pile_hackernews"
class PileNIHExporter(PilePerplexityTask):
DATASET_NAME = "pile_nih-exporter"
class PileOpenSubtitles(PilePerplexityTask):
DATASET_NAME = "pile_opensubtitles"
class PileOpenWebText2(PilePerplexityTask):
DATASET_NAME = "pile_openwebtext2"
class PilePhilPapers(PilePerplexityTask):
DATASET_NAME = "pile_philpapers"
class PilePileCc(PilePerplexityTask):
DATASET_NAME = "pile_pile-cc"
class PilePubmedAbstracts(PilePerplexityTask):
DATASET_NAME = "pile_pubmed-abstracts"
class PilePubmedCentral(PilePerplexityTask):
DATASET_NAME = "pile_pubmed-central"
class PileStackExchange(PilePerplexityTask):
DATASET_NAME = "pile_stackexchange"
class PileUspto(PilePerplexityTask):
DATASET_NAME = "pile_upsto"
class PileUbuntuIrc(PilePerplexityTask):
DATASET_NAME = "pile_ubuntu-irc"
class PileWikipedia(PilePerplexityTask):
DATASET_NAME = "pile_wikipedia"
class PileYoutubeSubtitles(PilePerplexityTask):
DATASET_NAME = "pile_youtubesubtitles"
\ No newline at end of file
dataset_path: EleutherAI/the_pile
dataset_name: enron_emails
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe?"
doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true