Commit 6a6a0ebb authored by Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into big-refactor-autobatching

parents e4acfcaa 2820042d
name: Tasks Modified
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  changed_files:
    runs-on: ubuntu-latest  # windows-latest || macos-latest
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # OR "2" -> To retrieve the preceding commit.
      # Example 1
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
          cache: 'pip'
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
      - name: Test more tasks with pytest
        env:
          API: true
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
        run: python -m pytest tests/test_api.py -s -vv -n=auto --new_task
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Build
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Cache
        uses: actions/cache@v2.1.3
        with:
          # A list of files, directories, and wildcard patterns to cache and restore
          path: |
            ~/.cache
          # An explicit key for restoring and saving the cache
          key: evaldata-cache-4
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pytest-cov
          pip install -e .[dev,multilingual]
          # Install optional git dependencies
          pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest -vv --cov=lm_eval/ tests/
      - name: Upload to codecov
        run: |
          bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Unit Tests
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      - name: Lint with pylint
        run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Lint with mypy
        run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
@@ -32,6 +32,7 @@ Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. if defined, will overwrite doc_to_text and doc_to_target and make template_aliases unused.
 - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model
 - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into possible choices for `multiple_choice`
 - **gold_alias** (`str`, *optional*, defaults to None) — if provided, used to generate the reference answer that is scored against. Used in cases where `doc_to_target` should be the "target string" format appended to each example's input for a fewshot exemplar, so doc_to_target is used for fewshot examples, but the input to the metric function as `gold` is from `gold_alias`.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
...
@@ -4,13 +4,13 @@ from typing import Literal, Tuple
 @dataclass
 class Instance:
-    request_type: str = Literal[
-        "loglikelihood", "loglikelihood_rolling", "greedy_until"
-    ]
-    doc: dict = None
-    arguments: tuple = None
-    idx: int = None
-    metadata: tuple = Tuple[str, int, int]  # TODO: better typehints here
+    request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    doc: dict
+    arguments: tuple
+    idx: int
+    metadata: Tuple[str, int, int] = field(
+        default_factory=lambda: (None, None, None)
+    )  # TODO: better typehints here
     resps: list = field(default_factory=list)
     filtered_resps: dict = field(default_factory=dict)
...
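For orientation, a hedged construction sketch against the new dataclass shape; the module path and the meaning of the `metadata` tuple are assumptions, not taken from this diff:

```python
# Minimal sketch: the first four fields are now required, metadata defaults to (None, None, None).
from lm_eval.api.instance import Instance  # assumed module path

inst = Instance(
    request_type="loglikelihood",
    doc={"question": "2+2=", "answer": "4"},      # illustrative document
    arguments=("Question: 2+2=\nAnswer:", " 4"),  # (context, continuation)
    idx=0,
    metadata=("my_task", 0, 1),                   # assumed to mean (task_name, doc_id, repeats)
)
print(inst.metadata, inst.resps)                  # ('my_task', 0, 1) []
```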
@@ -114,6 +114,8 @@ class LM(abc.ABC):
         additional_config = {} if additional_config is None else additional_config
         args = utils.simple_parse_args_string(arg_string)
         args2 = {k: v for k, v in additional_config.items() if v is not None}
+        if args2.get("device") == "mps" or args.get("device") == "mps":
+            args["dtype"] = "float32"
         return cls(**args, **args2)

     @property
...
@@ -10,6 +10,10 @@ class Sampler:
         self.target_delimiter = self.config.target_delimiter
         self.fewshot_delimiter = self.config.fewshot_delimiter

+        self.doc_to_text = self.task.doc_to_text
+        self.doc_to_target = self.task.doc_to_target
+        self.doc_to_choice = self.task.doc_to_choice
+
         self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
         if fewshot_indices:  # subset few-shot docs from
             self.docs = self.docs.select(fewshot_indices)
@@ -34,16 +38,29 @@ class Sampler:
             self.fewshot_delimiter.join(
                 [
                     # TODO: is separating doc_to_text and doc_to_target by one space always desired?
-                    self.task.doc_to_text(doc)
+                    (
+                        self.doc_to_text(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_text(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_text(doc)]
+                    )
                     + self.target_delimiter
-                    + self.task.doc_to_target(doc)
+                    + (
+                        self.doc_to_target(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_target(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_target(doc)]
+                    )
                     for doc in selected_docs
                 ]
             )
             + self.fewshot_delimiter
         )
+        # only returns the fewshot context! Does not append the document, do this outside the object
         return labeled_examples

     def sample(self, n):
...
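The branching added above boils down to one rule: if `doc_to_text`/`doc_to_target` yields an index instead of a string, the few-shot example substitutes the corresponding entry of `doc_to_choice`. A standalone illustration of that rule (plain Python, not the Sampler API):

```python
# Standalone illustration of the index-vs-string rule used in the fewshot context above.
def resolve(value, choices):
    """Strings pass through unchanged; integers are treated as an index into the choice list."""
    return value if isinstance(value, str) else choices[value]

choices = ["True", "Neither", "False"]              # e.g. what doc_to_choice(doc) could return
print(resolve("Question: ...\nAnswer:", choices))   # -> the string itself
print(resolve(1, choices))                          # -> "Neither"
```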
@@ -8,6 +8,7 @@ import evaluate
 import random
 import itertools
 import functools
+from tqdm import tqdm
 import datasets
 import numpy as np
@@ -27,6 +28,7 @@ from lm_eval.api.metrics import (
     mean,
     weighted_perplexity,
     bits_per_byte,
+    metric_max_over_ground_truths,
 )
 from lm_eval.api.registry import (
     get_metric,
@@ -43,7 +45,6 @@ ALL_OUTPUT_TYPES = [
     "multiple_choice",
     "loglikelihood_rolling",
     "greedy_until",
-    "winograd_schema",
 ]
@@ -64,9 +65,10 @@ class TaskConfig(dict):
     fewshot_split: str = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
-    template_aliases: str = ""
+    template_aliases: Union[str, list] = None
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
+    doc_to_choice: Union[Callable, str, dict, list] = None
     gold_alias: Union[Callable, str] = None
     use_prompt: str = None
     description: str = ""
@@ -76,8 +78,6 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: str = None
-    gold_alias: Union[Callable, str] = None
-    create_choices: Union[Callable, str] = None
     output_type: str = "greedy_until"
     generation_kwargs: dict = None
     repeats: int = 1
@@ -217,8 +217,8 @@ class Task(abc.ABC):
             self._filters.append(filter_pipeline)

         self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random()
-        )  # TODO: pass the correct docs in here
+            list(self.fewshot_docs()), self, rnd=random.Random(1234)
+        )

     def download(self, data_dir=None, cache_dir=None, download_mode=None):
         """Downloads and returns the task dataset.
@@ -316,18 +316,6 @@ class Task(abc.ABC):
         """
         return doc

-    def create_choices(self, doc):
-        if self._config.create_choices is None:
-            return ast.literal_eval(
-                utils.apply_template(
-                    self._config.template_aliases + "{{answer_choices}}", doc
-                )
-            )
-        elif type(self._config.create_choices) == str:
-            return utils.apply_template(self._config.create_choices, doc)
-        else:
-            return self._config.create_choices(doc)
-
     @property
     def instances(self):
         """After calling `task.build_all_requests()`, tasks
@@ -366,13 +354,18 @@
                 False
             ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

+        eval_logger.info(
+            f"Building contexts for task '{self._config.task}' on rank {rank}..."
+        )
+
         instances = []
         for doc_id, doc in utils.create_iterator(
             enumerate(docs), rank, world_size, limit
         ):
             # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
-                doc, self._config.num_fewshot, rnd=random.Random()
+                doc,
+                self._config.num_fewshot,
             )

             # TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
@@ -453,7 +446,7 @@
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -461,15 +454,9 @@
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
-        :param rnd: random.Random
-            The pseudo-random number generator used to randomly sample examples.
-            WARNING: This is currently a required arg although it's optionalized with a default `None`.
         :returns: str
             The fewshot context.
         """
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`"

         if num_fewshot == 0:
             # always prepend the (possibly empty) task description
@@ -480,7 +467,10 @@
             )

         example = self.doc_to_text(doc)
-        return labeled_examples + example
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]

     def apply_filters(self):
@@ -625,9 +615,43 @@ class ConfigurableTask(Task):
         if self.fewshot_docs() is not None:
             self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random()
+                list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )

+        if self._config.template_aliases is not None:
+            for key, alias in self._config.template_aliases:
+                self.dataset.rename_column(key, alias)
+
+        if self.has_test_docs():
+            docs = self.test_docs()
+        elif self.has_validation_docs():
+            docs = self.validation_docs()
+        else:
+            assert (
+                False
+            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+
+        # Test One Doc
+        self.features = list(docs.features.keys())
+        self.multiple_input = 0
+        self.multiple_target = 0
+        test_doc = docs[0]
+        test_text = self.doc_to_text(test_doc)
+        test_target = self.doc_to_target(test_doc)
+
+        if self._config.doc_to_choice is not None:
+            test_choice = self.doc_to_choice(test_doc)
+            if type(test_choice) is not list:
+                eval_logger.error("doc_to_choice must return list")
+            else:
+                num_choice = len(test_choice)
+
+            if type(test_text) is int:
+                self.multiple_input = num_choice
+
+        if type(test_target) is list:
+            self.multiple_target = len(test_target)
+
     def download(self, dataset_kwargs=None):
         self.dataset = datasets.load_dataset(
@@ -683,7 +707,12 @@ class ConfigurableTask(Task):
     def doc_to_decontamination_query(self, doc):
         if self._config.should_decontaminate:
-            return utils.apply_template(self._config.doc_to_decontamination_query, doc)
+            if self._config.doc_to_decontamination_query in self.features:
+                return doc[self._config.doc_to_decontamination_query]
+            else:
+                return ast.literal_eval(
+                    utils.apply_template(self._config.doc_to_decontamination_query, doc)
+                )

     def _process_doc(self, doc):
         """
@@ -703,11 +732,24 @@ class ConfigurableTask(Task):
         else:
             doc_to_text = self._config.doc_to_text

-        if type(doc_to_text) == str:
-            return utils.apply_template(doc_to_text, doc)
+        if type(doc_to_text) == int:
+            return doc_to_text
+        elif type(doc_to_text) == str:
+            if doc_to_text in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_text]]
+                # else:
+                return doc[doc_to_text]
+            else:
+                text_string = utils.apply_template(doc_to_text, doc)
+                if text_string.isdigit():
+                    return ast.literal_eval(text_string)
+                else:
+                    return text_string
         elif callable(doc_to_text):
             return doc_to_text(doc)
-        if hasattr(doc_to_text, "apply"):
+        # Used when applying a Promptsource template
+        elif hasattr(doc_to_text, "apply"):
             return doc_to_text.apply(doc)[0]
         else:
             print(type(doc_to_text))
@@ -720,15 +762,50 @@ class ConfigurableTask(Task):
         else:
             doc_to_target = self._config.doc_to_target

-        if type(doc_to_target) == str:
-            return utils.apply_template(doc_to_target, doc)
+        if type(doc_to_target) == int:
+            return doc_to_target
+        elif type(doc_to_target) == str:
+            if doc_to_target in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_target]]
+                # else:
+                return doc[doc_to_target]
+            else:
+                target_string = utils.apply_template(doc_to_target, doc)
+                if target_string.isdigit():
+                    return ast.literal_eval(target_string)
+                else:
+                    return target_string
         elif callable(doc_to_target):
             return doc_to_target(doc)
+        # Used when applying a Promptsource template
         elif hasattr(doc_to_target, "apply"):
             return doc_to_target.apply(doc)[1]
         else:
             raise TypeError

+    def doc_to_choice(self, doc):
+        if self.prompt is not None:
+            doc_to_choice = self.prompt
+        elif self._config.doc_to_choice is None:
+            eval_logger.error("doc_to_choice was called but not set in config")
+        else:
+            doc_to_choice = self._config.doc_to_choice
+
+        if type(doc_to_choice) == str:
+            return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
+        elif type(doc_to_choice) == list:
+            return doc_to_choice
+        elif type(doc_to_choice) == dict:
+            return list(doc_to_choice.values())
+        elif callable(doc_to_choice):
+            return doc_to_choice(doc)
+        elif hasattr(doc_to_choice, "get_answer_choices_list"):
+            return doc_to_choice.get_answer_choices_list(doc)
+        else:
+            raise TypeError
+
     def gold_alias(self, doc):
         # returns a version of the gold target answer to a document,
         # which should be passed into metric for scoring as the ground truth.
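As a reference for the four config forms the new `doc_to_choice` accepts, a hedged sketch of what each one yields; the document and templates are invented, and plain `jinja2` stands in for `utils.apply_template`, which is assumed to be Jinja2-based:

```python
import ast
from jinja2 import Template  # stand-in for utils.apply_template (assumed Jinja2-based)

doc = {"choices": ["True", "Neither", "False"], "label": 1}

# str: rendered with Jinja2, then parsed as a Python list literal
print(ast.literal_eval(Template("{{choices}}").render(**doc)))   # ['True', 'Neither', 'False']

# list: returned unchanged for every document
print(["True", "Neither", "False"])

# dict: only the values are used, in insertion order
print(list({"entailment": "True", "neutral": "Neither", "contradiction": "False"}.values()))

# callable: simply called on the document
print((lambda d: d["choices"])(doc))
```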
@@ -756,19 +833,25 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "loglikelihood_rolling":
             arguments = (self.doc_to_target(doc),)
         elif self.OUTPUT_TYPE == "multiple_choice":
-            # we pass the user-defined answer_choices var (in aliases) and translate the result to a Python list.
-            # TODO: any cleaner way to do this?
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            if self.multiple_input:
+                # If there are multiple inputs, choices are placed in the ctx
+                cont = self.doc_to_target(doc)
+                arguments = [(ctx, " {}".format(cont)) for ctx in choices]
+            else:
+                # Otherwise they are placed in the continuation
+                arguments = [(ctx, " {}".format(cont)) for cont in choices]
             request_list = [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " {}".format(choice)),
+                    arguments=arg,
                     idx=i,
                     **kwargs,
                 )
-                for i, choice in enumerate(choices)
+                for i, arg in enumerate(arguments)
             ]
             # TODO: we should raise a warning telling users this will at most ~2x runtime.
             if "acc_mutual_info" in self._metric_fn_list.keys():
@@ -795,26 +878,6 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "greedy_until":
             arguments = (ctx, self._config.generation_kwargs)

-        elif self.OUTPUT_TYPE == "winograd_schema":
-            # similar to multiple_choice task type except each request contains
-            # multiple differing contexts with the same continuation
-            contexts = self.create_choices(doc)
-            choice = self.doc_to_target(doc)
-            request_list = [
-                Instance(
-                    request_type="loglikelihood",
-                    doc=doc,
-                    arguments=(context, " {}".format(choice)),
-                    idx=i,
-                    **kwargs,
-                )
-                for i, context in enumerate(contexts)
-            ]
-            return request_list
-
         return Instance(
             request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
         )
@@ -857,13 +920,11 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             lls, is_greedy = zip(*results)
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
             # retrieve choices in List[str] form, to compute choice lengths, etc.
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            completion_len = np.array([float(len(i)) for i in choices])
+
             if (
                 2 * len(choices) == len(lls)
                 and "acc_mutual_info" in self._metric_fn_list.keys()
@@ -876,23 +937,33 @@ class ConfigurableTask(Task):
                 lls = lls[::2]

             pred = np.argmax(lls)
+            pred_norm = np.argmax(lls / completion_len)

-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-            completion_len = np.array([float(len(i)) for i in choices])
-            acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
+            if self.multiple_input:
+                gold = self.doc_to_text(doc)
+            else:
+                gold = self.doc_to_target(doc)
+
+            if type(gold) is str:
+                gold = choices.index(gold)
+
+            if self.multiple_target:
+                acc = 1.0 if pred in gold else 0.0
+                acc_norm = 1.0 if pred_norm in gold else 0.0
+                exact_match = int(any([is_greedy[i] for i in gold]))
+            else:
+                acc = 1.0 if pred == gold else 0.0
+                acc_norm = 1.0 if pred_norm == gold else 0.0
+                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
+                exact_match = int(is_greedy[gold])

             result_dict = {
                 **({"acc": acc} if "acc" in use_metric else {}),
                 **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                 **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                 **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
             }

-            if "exact_match" in self._metric_fn_list.keys():
-                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
-                is_greedy = is_greedy[gold]  # take value for the gold answer
-                result_dict["exact_match"] = int(is_greedy)
-
             if "acc_mutual_info" in use_metric:
                 lls_mutual_info = [
                     ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
@@ -900,40 +971,45 @@ class ConfigurableTask(Task):
                 acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                 result_dict["acc_mutual_info"] = acc_mutual_info

-        elif self.OUTPUT_TYPE == "winograd_schema":
-            lls, is_greedy = zip(*results)
-
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
-            pred = np.argmax(lls)
-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-
-            result_dict = {
-                **({"acc": acc} if "acc" in use_metric else {}),
-            }
-
         elif self.OUTPUT_TYPE == "greedy_until":
-            if self._config.gold_alias is not None:
-                gold = self.gold_alias(doc)
-            else:
-                gold = self.doc_to_target(doc)
+            gold = self.doc_to_target(doc)

             for key, result in zip(self._metric_fn_list.keys(), results):
-                _dict = self._metric_fn_list[key](
-                    references=[gold],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
-
-                result_dict = {**result_dict, **_dict}
+                if self.multiple_target:
+                    # in the case where we have multiple targets,
+                    # return true if any are true
+                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
+                    scores = []
+                    for gold_option in gold:
+                        res = self._metric_fn_list[key](
+                            references=[gold_option],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[key],
+                        )
+                        if isinstance(res, dict):
+                            # TODO: this handles the case where HF evaluate returns a dict.
+                            res = res[key]
+                        scores.append(res)
+                    if any(scores):
+                        result = 1.0
+                    else:
+                        result = 0.0
+                else:
+                    result = self._metric_fn_list[key](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+
+                if isinstance(result, dict):
+                    result_dict.update(result)
+                else:
+                    result_dict[key] = result
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until', 'multiple_choice' or 'winograd_schema' ",
+                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
             )

         return result_dict
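A small worked example of the multiple-choice scoring path above, with made-up log-likelihoods, showing how `acc` and `acc_norm` fall out of `pred` and `pred_norm`:

```python
import numpy as np

choices = ["True", "Neither", "False"]
lls = np.array([-12.3, -9.1, -15.8])         # made-up per-choice log-likelihoods
completion_len = np.array([float(len(c)) for c in choices])
gold = 1                                     # index of the correct choice

pred = np.argmax(lls)                        # raw argmax -> 1
pred_norm = np.argmax(lls / completion_len)  # length-normalized argmax -> 1

acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
print(acc, acc_norm)                         # 1.0 1.0
```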
@@ -1004,13 +1080,10 @@ class PerplexityTask(Task):
         assert k == 0
         return []

-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         assert (
             num_fewshot == 0
         ), "The number of fewshot examples must be 0 for perplexity tasks."
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`."

         return ""
...
@@ -45,6 +45,7 @@ def simple_evaluate(
     check_integrity=False,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -72,12 +73,17 @@ def simple_evaluate(
     :param check_integrity: bool
         Whether to run the relevant part of the test suite for the tasks
     :param write_out: bool
-        If True, write details about prompts and logits to json for all tasks
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
-    random.seed(1234)
+    random.seed(0)
     np.random.seed(1234)
+    torch.manual_seed(
+        1234
+    )  # TODO: this may affect training runs that are run with evaluation mid-run.

     assert tasks != [], "No tasks specified"
@@ -118,6 +124,7 @@ def simple_evaluate(
             bootstrap_iters=bootstrap_iters,
             decontamination_ngrams_path=decontamination_ngrams_path,
             write_out=write_out,
+            log_samples=log_samples,
         )

     if lm.rank == 0:
@@ -154,6 +161,7 @@ def evaluate(
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -168,7 +176,9 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param write_out: bool
-        If True, write all prompts, logits and metrics to json for offline analysis
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
@@ -213,7 +223,10 @@ def evaluate(
         # aggregate Instances by LM method requested to get output.
         reqtype = (
             "loglikelihood"
-            if (task.OUTPUT_TYPE == "multiple_choice" or task.OUTPUT_TYPE == "winograd_schema")
+            if (
+                task.OUTPUT_TYPE == "multiple_choice"
+                or task.OUTPUT_TYPE == "winograd_schema"
+            )
             else task.OUTPUT_TYPE
         )  # TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)
@@ -279,17 +292,18 @@ def evaluate(
             metrics = task.process_results(
                 doc, [req.filtered_resps[key] for req in requests]
             )
-            target = task.doc_to_target(doc)
-            example = {
-                "doc_id": doc_id,
-                "doc": doc,
-                "target": target,
-                "arguments": requests[0].args,
-                "resps": [req.resps for req in requests],
-                "filtered_resps": [req.filtered_resps[key] for req in requests],
-            }
-            example.update(metrics)
-            samples[task_name].append(example)
+            if log_samples:
+                target = task.doc_to_target(doc)
+                example = {
+                    "doc_id": doc_id,
+                    "doc": doc,
+                    "target": target,
+                    "arguments": [req.args for req in requests],
+                    "resps": [req.resps for req in requests],
+                    "filtered_resps": [req.filtered_resps[key] for req in requests],
+                }
+                example.update(metrics)
+                samples[task_name].append(example)
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)
@@ -359,12 +373,15 @@ def evaluate(
         if stderr is not None:
             results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        return {
+        results_dict = {
             "results": dict(results),
             "configs": dict(configs),
             "versions": dict(versions),
-            "samples": samples,
         }
+        if log_samples:
+            results_dict["samples"] = dict(samples)
+
+        return results_dict
     else:
         return None
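A hedged usage sketch for the new `log_samples` flag; apart from `log_samples` and `write_out`, the keyword names and the registered model name are assumptions based on the upstream harness, not shown in this diff:

```python
# Sketch only: model/tasks keyword names and the "hf" model name are assumed,
# not confirmed by this diff; log_samples is the flag added above.
import lm_eval.evaluator as evaluator

results = evaluator.simple_evaluate(
    model="hf",                       # assumed registered name of the HF backend
    model_args="pretrained=gpt2",
    tasks=["arc_easy"],
    num_fewshot=0,
    log_samples=False,                # skip per-sample collection
)
print(results["results"])             # aggregate metrics per task
print("samples" in results)           # False when log_samples=False
```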
@@ -6,7 +6,7 @@ from lm_eval.api.registry import register_model
 @register_model("dummy")
 class DummyLM(LM):
     def __init__(self):
-        pass
+        super().__init__()

     @classmethod
     def create_from_arg_string(cls, arg_string, additional_config=None):
...
@@ -71,6 +71,7 @@ class HFLM(LM):
         max_batch_size: Optional[int] = 64,
         low_cpu_mem_usage: Optional[bool] = True,
         trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         parallelize: Optional[bool] = False,
@@ -99,7 +100,7 @@ class HFLM(LM):
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
             device_list = set(
-                ["cuda", "cpu"]
+                ["cuda", "cpu", "mps"]
                 + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
             )
             if device:
@@ -107,6 +108,10 @@ class HFLM(LM):
                     device = int(device)
                 self._device = torch.device(device)
                 eval_logger.info(f"Using device '{device}'")
+                if device == "mps":
+                    eval_logger.info(
+                        "MPS is still in beta and only supports float32; setting dtype to float32."
+                    )
             else:
                 eval_logger.info("Device not specified")
                 eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
@@ -217,6 +222,7 @@ class HFLM(LM):
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
             trust_remote_code=trust_remote_code,
+            use_fast=use_fast_tokenizer,
         )
         self.vocab_size = self.tokenizer.vocab_size
...
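A hedged sketch of how the two additions surface when constructing the wrapper directly; `use_fast_tokenizer` and `device="mps"` come from this diff, while the module path and the remaining keyword arguments are assumed from the upstream harness:

```python
# Sketch, not a tested invocation; see the lead-in for which names are assumptions.
from lm_eval.models.huggingface import HFLM  # assumed module path

lm = HFLM(
    pretrained="gpt2",          # assumed kwarg from the upstream harness
    device="mps",               # now accepted; the wrapper logs that MPS implies float32
    use_fast_tokenizer=False,   # passed through to the tokenizer as use_fast=False
)
```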
@@ -24,21 +24,18 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] HellaSwag
 - [x] SWAG
 - [x] OpenBookQA
-- [x] RACE
-- [ ] LogiQA (WIP)
-- [x] HellaSwag
-- [x] SWAG
-- [x] OpenBookQA
 - [ ] SQuADv2 (WIP)
 - [x] RACE
-- [x] HeadQA (WIP)
+- [x] HeadQA
 - [ ] MathQA (WIP)
 - [ ] WebQs
 - [ ] WSC273
 - [x] Winogrande
 - [x] ANLI
 - [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
-- [ ] TruthfulQA
+- [x] TruthfulQA (mc1)
+- [ ] TruthfulQA (mc2)
+- [ ] TruthfulQA (gen)
 - [ ] MuTual
 - [ ] Hendrycks Math (WIP)
 - [ ] Asdiv (WIP)
@@ -51,7 +48,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] BLiMP
 - [x] ToxiGen
 - [ ] StoryCloze
-- [ ] NaturalQs
+- [ ] NaturalQs (WIP)
 - [ ] CrowS-Pairs
 - [ ] XCopa
 - [ ] BIG-Bench
...
# ANLI
### Paper
Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
Abstract: `https://arxiv.org/pdf/1910.14599.pdf`
Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
human-and-model-in-the-loop procedure. It consists of three rounds that progressively
increase in difficulty and complexity, and each question-answer includes annotator-
provided explanations.
Homepage: `https://github.com/facebookresearch/anli`
### Citation
```
@inproceedings{nie-etal-2020-adversarial,
title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
author = "Nie, Yixin and
Williams, Adina and
Dinan, Emily and
Bansal, Mohit and
Weston, Jason and
Kiela, Douwe",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `anli_r1`: The data collected adversarially in the first round.
* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
* `anli_r3`: The data collected adversarially in the third round, after training on the previous multiple rounds of data.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r1
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r1
validation_split: dev_r1
test_split: test_r1
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r2
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r2
validation_split: dev_r2
test_split: test_r2
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r3
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r3
validation_split: dev_r3
test_split: test_r3
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+include: arc_easy.yaml
 group:
   - ai2_arc
   - multiple_choice
 task: arc_challenge
 dataset_path: ai2_arc
 dataset_name: ARC-Challenge
-output_type: multiple_choice
-training_split: train
-validation_split: validation
-test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
-  # - metric: acc_mutual_info
-  #   aggregation: mean
-  #   higher_is_better: true
@@ -8,10 +8,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
 metric_list:
   - metric: acc
     aggregation: mean
...
group:
- glue-promptsource
task: qnli
dataset_path: glue
dataset_name: qnli
output_type: multiple_choice
training_split: train
validation_split: validation
use_prompt: "promptsource:have all you need"
metric_list:
- metric: acc
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = answers|map(attribute='atext')|list %}{% set gold = ra - 1 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "Question: {{qtext}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{ra - 1}}"
+doc_to_choice: "{{answers|map(attribute='atext')|list}}" # this will be cast to an int.
+should_decontaminate: true
+doc_to_decontamination_query: query
 metric_list:
   - metric: acc
     aggregation: mean
...