Unverified commit 2c20cd1f authored by Lintang Sutawika, committed by GitHub

Merge pull request #671 from EleutherAI/revamp-process

Revamp process
parents 6862fa7d 0dadc92a
@@ -32,6 +32,7 @@ Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, this will overwrite `doc_to_text` and `doc_to_target` and make `template_aliases` unused.
 - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model.
 - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the list of possible answer choices for `multiple_choice` tasks.
 - **gold_alias** (`str`, *optional*, defaults to `None`) — If provided, used to generate the reference answer that is scored against. Used in cases where `doc_to_target` should be the "target string" appended to each fewshot exemplar's input, while the reference passed to the metric function as `gold` comes from `gold_alias`.
 - **fewshot_delimiter** (`str`, *optional*, defaults to `"\n\n"`) — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
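To make the interplay of these three fields concrete, here is a minimal, illustrative sketch (not part of the PR or of lm-eval itself): a hypothetical doc plus Jinja2 values for `doc_to_text`, `doc_to_target`, and `doc_to_choice`, rendered directly with jinja2 roughly the way a templated field resolves.

```python
# Illustrative only (not part of this PR): a hypothetical doc and the three
# prompt-formatting fields, rendered with jinja2 roughly the way a templated
# field resolves inside the harness.
import ast

from jinja2 import Template

doc = {
    "question": "What is the boiling point of water at sea level?",
    "choices": ["90 C", "100 C", "110 C"],
    "label": 1,
}

doc_to_text = "Question: {{question}}\nAnswer:"
doc_to_target = "{{label}}"    # renders to a digit string -> treated as an index
doc_to_choice = "{{choices}}"  # renders to the list's repr -> parsed back into a list

context = Template(doc_to_text).render(**doc)
gold = int(Template(doc_to_target).render(**doc))
choices = ast.literal_eval(Template(doc_to_choice).render(**doc))

print(context)        # Question: What is the boiling point of water at sea level?\nAnswer:
print(choices[gold])  # 100 C
```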
...
@@ -10,6 +10,10 @@ class Sampler:
         self.target_delimiter = self.config.target_delimiter
         self.fewshot_delimiter = self.config.fewshot_delimiter
 
+        self.doc_to_text = self.task.doc_to_text
+        self.doc_to_target = self.task.doc_to_target
+        self.doc_to_choice = self.task.doc_to_choice
+
         self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
         if fewshot_indices:  # subset few-shot docs from
             self.docs = self.docs.select(fewshot_indices)
@@ -34,16 +38,29 @@ class Sampler:
             self.fewshot_delimiter.join(
                 [
                     # TODO: is separating doc_to_text and doc_to_target by one space always desired?
-                    self.task.doc_to_text(doc)
+                    (
+                        self.doc_to_text(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_text(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_text(doc)]
+                    )
                     + self.target_delimiter
-                    + self.task.doc_to_target(doc)
+                    + (
+                        self.doc_to_target(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_target(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_target(doc)]
+                    )
                     for doc in selected_docs
                 ]
             )
             + self.fewshot_delimiter
         )
+        # only returns the fewshot context! Does not append the document, do this outside the object
         return labeled_examples
 
     def sample(self, n):
...
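A rough sketch of what the updated `Sampler` logic above produces, assuming the default delimiters and two invented fewshot docs whose `doc_to_target` is an integer index into `doc_to_choice` (this is a hand-written mirror, not the Sampler class itself):

```python
# Sketch only (invented docs, default delimiters): how the new fewshot join behaves
# when doc_to_target is an index and doc_to_choice supplies the answer strings.
target_delimiter = " "
fewshot_delimiter = "\n\n"

fewshot_docs = [
    {"question": "2 + 2 =", "choices": ["3", "4"], "label": 1},
    {"question": "3 + 5 =", "choices": ["7", "8"], "label": 1},
]

def doc_to_text(doc):
    return f"Question: {doc['question']}\nAnswer:"

def doc_to_target(doc):
    return doc["label"]  # an index, not a string

def doc_to_choice(doc):
    return doc["choices"]

labeled_examples = (
    fewshot_delimiter.join(
        # index into the choice list whenever the target is not already a string
        doc_to_text(doc) + target_delimiter + doc_to_choice(doc)[doc_to_target(doc)]
        for doc in fewshot_docs
    )
    + fewshot_delimiter
)
print(repr(labeled_examples))
# 'Question: 2 + 2 =\nAnswer: 4\n\nQuestion: 3 + 5 =\nAnswer: 8\n\n'
```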
@@ -28,6 +28,7 @@ from lm_eval.api.metrics import (
     mean,
     weighted_perplexity,
     bits_per_byte,
+    metric_max_over_ground_truths,
 )
 from lm_eval.api.registry import (
     get_metric,
@@ -44,7 +45,6 @@ ALL_OUTPUT_TYPES = [
     "multiple_choice",
     "loglikelihood_rolling",
     "greedy_until",
-    "winograd_schema",
 ]
@@ -65,9 +65,10 @@ class TaskConfig(dict):
     fewshot_split: str = None  # TODO: assert that this is not None if num_fewshot > 0 (?); assert whether this is the same split as the one being evaluated (?)
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
-    template_aliases: str = ""
+    template_aliases: Union[str, list] = None
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
+    doc_to_choice: Union[Callable, str, dict, list] = None
     gold_alias: Union[Callable, str] = None
     use_prompt: str = None
     description: str = ""
@@ -77,8 +78,6 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: str = None
-    gold_alias: Union[Callable, str] = None
-    create_choices: Union[Callable, str] = None
     output_type: str = "greedy_until"
     generation_kwargs: dict = None
     repeats: int = 1
@@ -317,18 +316,6 @@ class Task(abc.ABC):
         """
         return doc
 
-    def create_choices(self, doc):
-        if self._config.create_choices is None:
-            return ast.literal_eval(
-                utils.apply_template(
-                    self._config.template_aliases + "{{answer_choices}}", doc
-                )
-            )
-        elif type(self._config.create_choices) == str:
-            return utils.apply_template(self._config.create_choices, doc)
-        else:
-            return self._config.create_choices(doc)
-
     @property
     def instances(self):
         """After calling `task.build_all_requests()`, tasks
@@ -480,7 +467,10 @@
             )
 
         example = self.doc_to_text(doc)
-        return labeled_examples + example
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]
 
     def apply_filters(self):
@@ -628,6 +618,40 @@ class ConfigurableTask(Task):
                 list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )
 
+        if self._config.template_aliases is not None:
+            for key, alias in self._config.template_aliases:
+                self.dataset.rename_column(key, alias)
+
+        if self.has_test_docs():
+            docs = self.test_docs()
+        elif self.has_validation_docs():
+            docs = self.validation_docs()
+        else:
+            assert (
+                False
+            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+
+        # Test One Doc
+        self.features = list(docs.features.keys())
+        self.multiple_input = 0
+        self.multiple_target = 0
+        test_doc = docs[0]
+        test_text = self.doc_to_text(test_doc)
+        test_target = self.doc_to_target(test_doc)
+
+        if self._config.doc_to_choice is not None:
+            test_choice = self.doc_to_choice(test_doc)
+            if type(test_choice) is not list:
+                eval_logger.error("doc_to_choice must return list")
+            else:
+                num_choice = len(test_choice)
+
+            if type(test_text) is int:
+                self.multiple_input = num_choice
+
+        if type(test_target) is list:
+            self.multiple_target = len(test_target)
+
     def download(self, dataset_kwargs=None):
         self.dataset = datasets.load_dataset(
@@ -683,7 +707,12 @@ class ConfigurableTask(Task):
     def doc_to_decontamination_query(self, doc):
         if self._config.should_decontaminate:
-            return utils.apply_template(self._config.doc_to_decontamination_query, doc)
+            if self._config.doc_to_decontamination_query in self.features:
+                return doc[self._config.doc_to_decontamination_query]
+            else:
+                return ast.literal_eval(
+                    utils.apply_template(self._config.doc_to_decontamination_query, doc)
+                )
 
     def _process_doc(self, doc):
         """
@@ -703,11 +732,24 @@
         else:
             doc_to_text = self._config.doc_to_text
 
-        if type(doc_to_text) == str:
-            return utils.apply_template(doc_to_text, doc)
+        if type(doc_to_text) == int:
+            return doc_to_text
+        elif type(doc_to_text) == str:
+            if doc_to_text in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_text]]
+                # else:
+                return doc[doc_to_text]
+            else:
+                text_string = utils.apply_template(doc_to_text, doc)
+                if text_string.isdigit():
+                    return ast.literal_eval(text_string)
+                else:
+                    return text_string
         elif callable(doc_to_text):
             return doc_to_text(doc)
-        if hasattr(doc_to_text, "apply"):
+        # Used when applying a Promptsource template
+        elif hasattr(doc_to_text, "apply"):
             return doc_to_text.apply(doc)[0]
         else:
             print(type(doc_to_text))
@@ -720,15 +762,50 @@
         else:
             doc_to_target = self._config.doc_to_target
 
-        if type(doc_to_target) == str:
-            return utils.apply_template(doc_to_target, doc)
+        if type(doc_to_target) == int:
+            return doc_to_target
+        elif type(doc_to_target) == str:
+            if doc_to_target in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_target]]
+                # else:
+                return doc[doc_to_target]
+            else:
+                target_string = utils.apply_template(doc_to_target, doc)
+                if target_string.isdigit():
+                    return ast.literal_eval(target_string)
+                else:
+                    return target_string
         elif callable(doc_to_target):
             return doc_to_target(doc)
+        # Used when applying a Promptsource template
         elif hasattr(doc_to_target, "apply"):
             return doc_to_target.apply(doc)[1]
         else:
             raise TypeError
 
+    def doc_to_choice(self, doc):
+        if self.prompt is not None:
+            doc_to_choice = self.prompt
+        elif self._config.doc_to_choice is None:
+            eval_logger.error("doc_to_choice was called but not set in config")
+        else:
+            doc_to_choice = self._config.doc_to_choice
+
+        if type(doc_to_choice) == str:
+            return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
+        elif type(doc_to_choice) == list:
+            return doc_to_choice
+        elif type(doc_to_choice) == dict:
+            return list(doc_to_choice.values())
+        elif callable(doc_to_choice):
+            return doc_to_choice(doc)
+        elif hasattr(doc_to_choice, "get_answer_choices_list"):
+            return doc_to_choice.get_answer_choices_list(doc)
+        else:
+            raise TypeError
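For reference, a standalone mirror of the dispatch in `doc_to_choice` above (hypothetical docs and configs, not the class method itself): a Jinja2 string is rendered and then `literal_eval`'d, a list passes through unchanged, a dict contributes its values, and a callable is invoked on the doc.

```python
# Standalone sketch of the doc_to_choice dispatch above; the doc and configs are invented.
import ast

from jinja2 import Template

doc = {"endings": ["to the park", "to the moon"]}

configs = [
    "{{endings}}",           # Jinja2 string  -> per-doc list
    ["yes", "no"],           # literal list   -> same choices for every doc
    {0: "no", 1: "yes"},     # dict           -> list of its values
    lambda d: d["endings"],  # callable       -> invoked on the doc
]

for cfg in configs:
    if isinstance(cfg, str):
        choices = ast.literal_eval(Template(cfg).render(**doc))
    elif isinstance(cfg, list):
        choices = cfg
    elif isinstance(cfg, dict):
        choices = list(cfg.values())
    else:
        choices = cfg(doc)
    print(choices)
```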
 
     def gold_alias(self, doc):
         # returns a version of the gold target answer to a document,
         # which should be passed into metric for scoring as the ground truth.
@@ -756,19 +833,25 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "loglikelihood_rolling":
             arguments = (self.doc_to_target(doc),)
         elif self.OUTPUT_TYPE == "multiple_choice":
-            # we pass the user-defined answer_choices var (in aliases) and translate the result to a Python list.
-            # TODO: any cleaner way to do this?
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            if self.multiple_input:
+                # If there are multiple inputs, choices are placed in the ctx
+                cont = self.doc_to_target(doc)
+                arguments = [(ctx, " {}".format(cont)) for ctx in choices]
+            else:
+                # Otherwise they are placed in the continuation
+                arguments = [(ctx, " {}".format(cont)) for cont in choices]
 
             request_list = [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " {}".format(choice)),
+                    arguments=arg,
                     idx=i,
                     **kwargs,
                 )
-                for i, choice in enumerate(choices)
+                for i, arg in enumerate(arguments)
             ]
             # TODO: we should raise a warning telling users this will at most ~2x runtime.
             if "acc_mutual_info" in self._metric_fn_list.keys():
@@ -795,26 +878,6 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "greedy_until":
             arguments = (ctx, self._config.generation_kwargs)
 
-        elif self.OUTPUT_TYPE == "winograd_schema":
-            # similar to multiple_choice task type except each request contains
-            # multiple differing contexts with the same continuation
-            contexts = self.create_choices(doc)
-            choice = self.doc_to_target(doc)
-
-            request_list = [
-                Instance(
-                    request_type="loglikelihood",
-                    doc=doc,
-                    arguments=(context, " {}".format(choice)),
-                    idx=i,
-                    **kwargs,
-                )
-                for i, context in enumerate(contexts)
-            ]
-            return request_list
-
         return Instance(
             request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
         )
@@ -857,13 +920,11 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             lls, is_greedy = zip(*results)
 
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
             # retrieve choices in List[str] form, to compute choice lengths, etc.
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            completion_len = np.array([float(len(i)) for i in choices])
+
             if (
                 2 * len(choices) == len(lls)
                 and "acc_mutual_info" in self._metric_fn_list.keys()
@@ -876,10 +937,21 @@ class ConfigurableTask(Task):
                 lls = lls[::2]
 
             pred = np.argmax(lls)
+            pred_norm = np.argmax(lls / completion_len)
 
-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-            completion_len = np.array([float(len(i)) for i in choices])
-            acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
+            if self.multiple_input:
+                gold = self.doc_to_text(doc)
+            else:
+                gold = self.doc_to_target(doc)
+
+            if type(gold) is str:
+                gold = choices.index(gold)
+
+            if self.multiple_target:
+                acc = 1.0 if pred in gold else 0.0
+                acc_norm = 1.0 if pred_norm in gold else 0.0
+            else:
+                acc = 1.0 if pred == gold else 0.0
+                acc_norm = 1.0 if pred_norm == gold else 0.0
 
             result_dict = {
                 **({"acc": acc} if "acc" in use_metric else {}),
@@ -900,40 +972,45 @@ class ConfigurableTask(Task):
                 acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                 result_dict["acc_mutual_info"] = acc_mutual_info
 
-        elif self.OUTPUT_TYPE == "winograd_schema":
-            lls, is_greedy = zip(*results)
-
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
-            pred = np.argmax(lls)
-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-
-            result_dict = {
-                **({"acc": acc} if "acc" in use_metric else {}),
-            }
-
         elif self.OUTPUT_TYPE == "greedy_until":
-            if self._config.gold_alias is not None:
-                gold = self.gold_alias(doc)
-            else:
-                gold = self.doc_to_target(doc)
+            gold = self.doc_to_target(doc)
 
             for key, result in zip(self._metric_fn_list.keys(), results):
-                _dict = self._metric_fn_list[key](
-                    references=[gold],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
-
-                result_dict = {**result_dict, **_dict}
+                if self.multiple_target:
+                    # in the case where we have multiple targets,
+                    # return true if any are true
+                    # TODO: this may break for multiple_target, non zero-or-1 metrics
+                    scores = []
+                    for gold_option in gold:
+                        res = self._metric_fn_list[key](
+                            references=[gold_option],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[key],
+                        )
+                        if isinstance(res, dict):
+                            # TODO: this handles the case where HF evaluate returns a dict.
+                            res = res[key]
+                        scores.append(res)
+                    if any(scores):
+                        result = 1.0
+                    else:
+                        result = 0.0
+                else:
+                    result = self._metric_fn_list[key](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+
+                if isinstance(result, dict):
+                    result_dict.update(result)
+                else:
+                    result_dict[key] = result
 
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until', 'multiple_choice' or 'winograd_schema' ",
+                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
             )
 
         return result_dict
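A scoring sketch with made-up numbers (not lm-eval code) that mirrors the `multiple_choice` branch above: `pred` is the log-likelihood argmax, `pred_norm` is the argmax after dividing by character length, and a string gold is mapped to its index in the choice list. For `multiple_target` docs, and for the `greedy_until` branch shown above, the prediction instead counts as correct if it matches any of the gold options.

```python
# Made-up numbers, not lm-eval code: mirrors the multiple_choice scoring above.
import numpy as np

choices = ["no", "yes", "maybe"]
lls = np.array([-4.2, -3.1, -6.0])  # one loglikelihood per choice
completion_len = np.array([float(len(c)) for c in choices])

pred = np.argmax(lls)                        # index of the most likely choice
pred_norm = np.argmax(lls / completion_len)  # length-normalized argmax

gold = "yes"
if isinstance(gold, str):      # a string gold is mapped to its choice index
    gold = choices.index(gold)

acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
print(acc, acc_norm)  # 1.0 1.0
```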
...
+include: arc_easy.yaml
 group:
   - ai2_arc
   - multiple_choice
 task: arc_challenge
 dataset_path: ai2_arc
 dataset_name: ARC-Challenge
-output_type: multiple_choice
-training_split: train
-validation_split: validation
-test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
-  # - metric: acc_mutual_info
-  #   aggregation: mean
-  #   higher_is_better: true
@@ -8,10 +8,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
 metric_list:
   - metric: acc
     aggregation: mean
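To see what the rewritten ARC config above resolves to, here is a hypothetical ARC-style record (field names follow the config, values are invented), rendered directly with jinja2: `doc_to_target` becomes an integer index into the `doc_to_choice` list.

```python
# Hypothetical ARC-style record; illustrates the new doc_to_target / doc_to_choice templates.
import ast

from jinja2 import Template

doc = {
    "question": "Which gas do plants absorb from the atmosphere?",
    "choices": {
        "text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

target = Template("{{choices.label.index(answerKey)}}").render(**doc)
choice_list = ast.literal_eval(Template("{{choices.text}}").render(**doc))

print(target)                    # "1" -> a digit string, used as an index
print(choice_list[int(target)])  # Carbon dioxide
```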
...
group:
  - glue-promptsource
task: qnli
dataset_path: glue
dataset_name: qnli
output_type: multiple_choice
training_split: train
validation_split: validation
use_prompt: "promptsource:have all you need"
metric_list:
  - metric: acc
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = answers|map(attribute='atext')|list %}{% set gold = ra - 1 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "Question: {{qtext}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{ra - 1}}"
+doc_to_choice: "{{answers|map(attribute='atext')|list}}" # this will be cast to an int.
+should_decontaminate: true
+doc_to_decontamination_query: query
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -7,10 +7,9 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: null
-template_aliases: "{% set gold = label | int %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list %}"
 doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}"
+doc_to_target: "{{label}}"
+doc_to_choice: "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}"
 metric_list:
   - metric: acc
     aggregation: mean
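A rough Python equivalent of the filter chain in `doc_to_choice` above, using invented example strings (`regex_replace` in the template is a custom filter, not a Jinja2 builtin): trim, turn " [title]" into ". ", strip bracketed tags, and collapse double spaces.

```python
# Illustrative Python counterpart of the HellaSwag-style Jinja filter chain above.
import re

def clean(text: str) -> str:
    text = text.strip()                    # |trim
    text = text.replace(" [title]", ". ")  # |replace(' [title]', '. ')
    text = re.sub(r"\[.*?\]", "", text)    # |regex_replace('\\[.*?\\]', '')
    return text.replace("  ", " ")         # |replace('  ', ' ')

endings = [
    "passes the ball to a teammate. [header] The crowd cheers ",
    "walks away [title] everyone claps",
]
print([clean(e) for e in endings])
# ['passes the ball to a teammate. The crowd cheers', 'walks away. everyone claps']
```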
...
@@ -6,9 +6,8 @@ dataset_name: commonsense
 output_type: multiple_choice
 training_split: train
 test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: "{{input}}\nQuestion: Is this wrong?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: ['no', 'yes']
 metric_list:
   - metric: acc
-group:
-  - hendrycks_ethics
+include: commonsense.yaml
 task: ethics_deontology
 dataset_path: hails/hendrycks_ethics
 dataset_name: deontology
-output_type: multiple_choice
-training_split: train
-test_split: test
-template_aliases: "{% set answer_choices = ['unreasonable', 'reasonable'] %}{% if excuse is not defined %}{% set excuse = '' %}{% endif %}"
 doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: ['unreasonable', 'reasonable']
-metric_list:
-  - metric: acc
 # TODO: implement exact-match metric for this subset
@@ -4,5 +4,5 @@ group:
 task: ethics_justice
 dataset_name: justice
 output_type: multiple_choice
+doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
 # TODO: impl. exact match for this and deontology
+include: commonsense.yaml
 group:
   - hendrycks_ethics
 task: ethics_utilitarianism
@@ -6,9 +7,8 @@ dataset_name: utilitarianism
 output_type: multiple_choice
 training_split: train
 test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
-gold_alias: !function utils.gold_alias
+doc_to_choice: ['no', 'yes']
 metric_list:
   - metric: acc
@@ -15,23 +15,11 @@ def _preproc_doc(doc):
     return doc
 
-def _yesno(x):
-    if x:
-        return "yes"
-    else:
-        return "no"
-
 def doc_to_text(doc):
     doc = _preproc_doc(doc)
     return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
 
 def doc_to_target(doc):
-    doc = _preproc_doc(doc)
-    return _yesno(doc["label"])
-
-def gold_alias(doc):
     doc = _preproc_doc(doc)
     return doc["label"]
+include: commonsense.yaml
 group:
   - hendrycks_ethics
 task: ethics_virtue
-dataset_path: hails/hendrycks_ethics
 dataset_name: virtue
-output_type: multiple_choice
-training_split: train
-test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sentence exhibit the trait \"{{trait}}\"?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: ['no', 'yes']
-metric_list:
-  - metric: acc
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-create_choices: !function utils.create_choices # create list of answer choices
 doc_to_text: "Question: {{Problem}}\nAnswer:"
-doc_to_target: !function utils.doc_to_target
-gold_alias: "{{['a', 'b', 'c', 'd', 'e'].index(correct)}}" # this will be cast to an int.
+doc_to_target: "{{['a', 'b', 'c', 'd', 'e'].index(correct)}}"
+doc_to_choice: !function utils.doc_to_choice
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{Problem}}\nAnswer:"
 metric_list:
   - metric: acc
     aggregation: mean
...
 import re
 
-def create_choices(doc):
+def doc_to_choice(doc):
     choices = [
         c[4:].rstrip(" ,")
         for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"])
     ]
     return choices
-
-def doc_to_target(doc):
-    choices = create_choices(doc)
-    return choices[["a", "b", "c", "d", "e"].index(doc["correct"])]
@@ -7,11 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey.lstrip()) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
-doc_to_text: "{{question_stem}}"
-doc_to_target: "{{gold}}" # this will be cast to an int.
+doc_to_text: question_stem
+doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
+doc_to_choice: "{{choices.text}}"
 should_decontaminate: true
-doc_to_decontamination_query: "{{question_stem}}"
+doc_to_decontamination_query: question_stem
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: null
-template_aliases: "{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold label idx is
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_text: "Question: {{goal}}\nAnswer:"
+doc_to_target: label
+doc_to_choice: "{{[sol1, sol2]}}"
+should_decontaminate: true
+doc_to_decontamination_query: goal
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -5,10 +5,9 @@ dataset_path: corypaik/prost
 dataset_name: null
 output_type: multiple_choice
 test_split: test
-template_aliases: "{% set answer_choices = [A, B, C, D] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "{{context}}\nQuestion: {{ex_question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: "{{[A, B, C, D]}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{context}}\nQuestion: {{ex_question}}\nAnswer:"
 metric_list:
...
@@ -7,10 +7,9 @@ output_type: multiple_choice
 training_split: null
 validation_split: null
 test_split: train
-template_aliases: "{% set answer_choices = ['yes', 'no', 'maybe'] %}{% set gold = final_decision %}"
 doc_to_text: !function preprocess_pubmedqa.doc_to_text
-doc_to_target: !function preprocess_pubmedqa.doc_to_target
-gold_alias: !function preprocess_pubmedqa.gold_alias
+doc_to_target: final_decision
+doc_to_choice: ["yes", "no", "maybe"]
 metric_list:
   - metric: acc
     aggregation: mean
...