Unverified Commit 1cc2a764 authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into fix-metrics

parents 4b87456d 3d732e68
......@@ -50,6 +50,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
......
name: Pull Request
on: [pull_request]
jobs:
pre-commit:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.9
- uses: pre-commit/action@v2.0.3
......@@ -6,10 +6,10 @@ name: Unit Tests
on:
push:
branches:
- big-refactor
- 'big-refactor*'
pull_request:
branches:
- big-refactor
- 'big-refactor*'
workflow_dispatch:
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
......@@ -26,8 +26,11 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Pre-Commit
uses: pre-commit/action@v3.0.0
- name: Lint with pylint
run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
- name: Lint with flake8
......@@ -52,6 +55,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
......@@ -60,4 +64,4 @@ jobs:
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
......@@ -136,7 +136,7 @@ Using this setting helps for massive models like BLOOM which require, or to avoi
**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
To use `accelerate` with the `lm-eval` command, use
```
accelerate launch --no_python lm-eval --model ...
```
......
......@@ -48,7 +48,9 @@ class Sampler:
)
+ self.target_delimiter
+ (
self.doc_to_target(doc)
self.doc_to_target(doc)[0]
if type(self.doc_to_target(doc)) is list
else self.doc_to_target(doc)
if (
self.config.doc_to_choice is None
or type(self.doc_to_target(doc)) is str
......
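For reference, a minimal Python sketch (not the harness's exact code) of the target-selection behavior this hunk introduces: when `doc_to_target` returns a list, only the first reference is rendered in the few-shot example; `doc_to_target` and `doc_to_choice` below are stand-ins for the task's configured callables.
```
# Hedged sketch of the few-shot target selection above.
def render_fewshot_target(doc, doc_to_target, doc_to_choice=None):
    target = doc_to_target(doc)
    if isinstance(target, list):
        # Multiple gold references: show only the first one in the prompt.
        return str(target[0])
    if doc_to_choice is None or isinstance(target, str):
        # Target is already a string (or there are no choices to index into).
        return str(target)
    # Otherwise the target is an index into the answer choices.
    return doc_to_choice(doc)[target]
```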
......@@ -465,8 +465,11 @@ class Task(abc.ABC):
elif type(example) == list:
return [labeled_examples + ex for ex in example]
elif type(example) == int:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
if self._config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
else:
return labeled_examples + str(example)
def apply_filters(self):
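A hedged sketch of the branch above, assuming `labeled_examples` is the already-rendered few-shot block: an `int` example is resolved through `doc_to_choice` when choices exist, and otherwise falls back to its string form (the newly added else branch).
```
# Illustrative only; `doc_to_choice` stands in for the task's configured callable.
def append_continuation(labeled_examples, example, doc, doc_to_choice=None):
    if isinstance(example, str):
        return labeled_examples + example
    if isinstance(example, list):
        # One context per candidate continuation.
        return [labeled_examples + ex for ex in example]
    if isinstance(example, int):
        if doc_to_choice is not None:
            return labeled_examples + doc_to_choice(doc)[example]
        # Fallback: no choices defined, so use the integer itself.
        return labeled_examples + str(example)
    raise TypeError(f"Unexpected example type: {type(example)}")
```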
......@@ -771,7 +774,7 @@ class ConfigurableTask(Task):
print(type(doc_to_text))
raise TypeError
def doc_to_target(self, doc: dict) -> Union[int, str]:
def doc_to_target(self, doc: dict) -> Union[int, str, list]:
if self.prompt is not None:
doc_to_target = self.prompt
......@@ -790,8 +793,16 @@ class ConfigurableTask(Task):
target_string = utils.apply_template(doc_to_target, doc)
if target_string.isdigit():
return ast.literal_eval(target_string)
elif (
len(target_string) >= 2
and (target_string[0] == "[")
and (target_string[-1] == "]")
):
return ast.literal_eval(target_string)
else:
return target_string
elif type(doc_to_target) == list:
return doc_to_target
elif callable(doc_to_target):
return doc_to_target(doc)
# Used when applying a Promptsource template
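A self-contained illustration (not the exact class method) of the templated-target parsing the hunk above extends: besides digit strings, bracketed strings that look like Python lists are now passed through `ast.literal_eval` as well.
```
import ast

def parse_templated_target(target_string: str):
    """Illustrative parser mirroring the branch logic above."""
    if target_string.isdigit():
        return ast.literal_eval(target_string)  # "2" -> 2
    if (
        len(target_string) >= 2
        and target_string[0] == "["
        and target_string[-1] == "]"
    ):
        return ast.literal_eval(target_string)  # "['yes', 'no']" -> ['yes', 'no']
    return target_string  # plain string target

print(parse_templated_target("3"))           # 3
print(parse_templated_target("['a', 'b']"))  # ['a', 'b']
print(parse_templated_target("Paris"))       # Paris
```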
......@@ -998,9 +1009,13 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "greedy_until":
gold = self.doc_to_target(doc)
if type(gold) == int:
if self._config.doc_to_choice is not None:
# If doc_to_choice is set, doc_to_target is assumed
# to return an index into the answer choices.
choices = self.doc_to_choice(doc)
gold = choices[gold]
else:
gold = str(gold)
for metric in self._metric_fn_list.keys():
for result in results:
......@@ -1020,20 +1035,19 @@ class ConfigurableTask(Task):
res = res[metric]
scores.append(res)
if any(scores):
result = 1.0
result_score = 1.0
else:
result = 0.0
result_score = 0.0
else:
result = self._metric_fn_list[metric](
result_score = self._metric_fn_list[metric](
references=[gold],
predictions=[result],
**self._metric_fn_kwargs[metric],
)
if isinstance(result, dict):
result_dict.update(result)
else:
result_dict[metric] = result
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
result_score = result_score[metric]
result_dict[metric] = result_score
else:
raise ValueError(
f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
......
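As a hedged illustration of the renaming and dict handling above: when a metric function (for example one wrapped from HF `evaluate`) returns a dict, only the entry keyed by the metric's registry name is kept, so `result_dict` stays flat. The helper name below is illustrative.
```
# Sketch of the score flattening; `metric` is the metric's registry key.
def flatten_metric_result(metric, result_score):
    if isinstance(result_score, dict):
        # e.g. {"exact_match": 1.0} returned by an HF evaluate metric
        result_score = result_score[metric]
    return result_score

assert flatten_metric_result("exact_match", {"exact_match": 1.0}) == 1.0
assert flatten_metric_result("acc", 0.5) == 0.5
```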
......@@ -8,6 +8,7 @@ FILTER_REGISTRY = {
"regex": extraction.RegexFilter,
"majority_vote": selection.MajorityVoteFilter,
"take_first_k": selection.TakeKFilter,
"remove_whitespace": extraction.WhitespaceFilter,
# TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
......
......@@ -36,3 +36,26 @@ class RegexFilter(Filter):
# print(filtered_resps)
return filtered_resps
class WhitespaceFilter(Filter):
    """Strips a single leading space from each model response."""

    def __init__(self):
        pass

    def apply(self, resps):
        def filter_set(inst):
            filtered_resp = []
            for resp in inst:
                # Remove one leading space, if present, from each sampled response.
                if resp.startswith(" "):
                    resp = resp[1:]
                filtered_resp.append(resp)
            return filtered_resp

        filtered_resps = [filter_set(resp) for resp in resps]

        return filtered_resps
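A quick, hypothetical usage example of the new filter (registered above as `remove_whitespace`): each element of `resps` is the list of sampled responses for one instance.
```
whitespace_filter = WhitespaceFilter()
cleaned = whitespace_filter.apply([[" Paris", "Paris"], [" 42"]])
print(cleaned)  # [['Paris', 'Paris'], ['42']]
```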
......@@ -3,7 +3,7 @@ This list keeps track of which tasks' implementations have been ported to YAML /
Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if they have been checked *against the original introducing paper's* implementation or a popularizing implementation. (WIP) denotes that a PR or person is already working on this task.
- [ ] Glue (Lintang)
- [x] Glue
- [x] SuperGlue
- [ ] CoQA (Lintang)
- [ ] DROP (Lintang)
......@@ -13,12 +13,12 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [ ] MCTACO (Lintang)
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [ ] QASPER
- [x] QA4MRE
- [ ] TriviaQA (Lintang)
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
......@@ -33,9 +33,9 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1) (Lintang)
- [ ] TruthfulQA (mc2) (Lintang)
- [ ] TruthfulQA (gen) (Lintang)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [ ] MuTual
- [ ] Hendrycks Math (Hailey)
- [ ] Asdiv
......@@ -45,16 +45,16 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] Translation (WMT) suite (Hailey)
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [ ] BLiMP (Lintang)
- [x] BLiMP
- [x] ToxiGen
- [ ] StoryCloze (Lintang)
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [ ] XStoryCloze (Lintang)
- [x] XStoryCloze
- [x] XWinograd
- [ ] PAWS-X (Lintang)
- [x] PAWS-X
- [x] XNLI
- [ ] MGSM (Lintang)
- [ ] SCROLLS
......
# Task-name
# ANLI
### Paper
Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
Abstract: `https://arxiv.org/pdf/1910.14599.pdf`
Paper Link: https://arxiv.org/abs/1910.14599
Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
human-and-model-in-the-loop procedure. It consists of three rounds that progressively
increase in difficulty and complexity, and each question-answer pair includes
annotator-provided explanations.
Homepage: `https://github.com/facebookresearch/anli`
Homepage: https://github.com/facebookresearch/anli
### Citation
......@@ -31,13 +30,18 @@ Homepage: `https://github.com/facebookresearch/anli`
}
```
### Subtasks
### Groups and Tasks
#### Groups
List or describe tasks defined in this folder, and their names here:
* `anli`: Evaluates `anli_r1`, `anli_r2`, and `anli_r3`
#### Tasks
* `anli_r1`: The data collected adversarially in the first round.
* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
* `anli_r3`: The data collected adversarially in the third round, after training on the previous multiple rounds of data.
### Checklist
For adding novel benchmarks/datasets to the library:
......
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
- anli
task: anli_r1
dataset_path: anli
dataset_name: null
......
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
include: anli_r1.yaml
task: anli_r2
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r2
validation_split: dev_r2
test_split: test_r2
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
include: anli_r1.yaml
task: anli_r3
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r3
validation_split: dev_r3
test_split: test_r3
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
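For clarity, the Jinja template `{{['True', 'Neither', 'False'][label]}}` in these configs maps the dataset's integer `label` onto the verbalized choices (0 = entailment = "True", 1 = neutral = "Neither", 2 = contradiction = "False"). A small Python illustration with a hypothetical doc:
```
choices = ["True", "Neither", "False"]
doc = {"premise": "...", "hypothesis": "...", "label": 1}  # hypothetical example
print(choices[doc["label"]])  # "Neither"
```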
# ARC
Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
https://arxiv.org/pdf/1803.05457.pdf
### Paper
Title: Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
Abstract: https://arxiv.org/abs/1803.05457
The ARC dataset consists of 7,787 science exam questions drawn from a variety
of sources, including science questions provided under license by a research
......@@ -13,7 +16,9 @@ a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questi
Homepage: https://allenai.org/data/arc
### Citation
```
@article{Clark2018ThinkYH,
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
......@@ -23,3 +28,27 @@ Homepage: https://allenai.org/data/arc
volume={abs/1803.05457}
}
```
### Groups and Tasks
#### Groups
* `ai2_arc`: Evaluates `arc_easy` and `arc_challenge`
#### Tasks
* `arc_easy`
* `arc_challenge`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
include: arc_easy.yaml
group:
- ai2_arc
- multiple_choice
task: arc_challenge
dataset_path: ai2_arc
dataset_name: ARC-Challenge
group:
- ai2_arc
- multiple_choice
task: arc_easy
dataset_path: ai2_arc
dataset_name: ARC-Easy
......
# Arithmetic
### Paper
Title: `Language Models are Few-Shot Learners`
Abstract: https://arxiv.org/abs/2005.14165
A small battery of 10 tests that involve asking language models a simple arithmetic
problem in natural language.
Homepage: https://github.com/openai/gpt-3/tree/master/data
### Citation
```
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
```
### Groups and Tasks
#### Groups
* `arithmetic`: Evaluates the ten subtasks `arithmetic_1dc` through `arithmetic_5ds`
#### Tasks
* `arithmetic_1dc`
* `arithmetic_2da`
* `arithmetic_2dm`
* `arithmetic_2ds`
* `arithmetic_3da`
* `arithmetic_3ds`
* `arithmetic_4da`
* `arithmetic_4ds`
* `arithmetic_5da`
* `arithmetic_5ds`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# bAbI
### Paper
Title: Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks
Abstract: https://arxiv.org/abs/1502.05698
One long-term goal of machine learning research is to produce methods that are applicable to reasoning and natural language, in particular building an intelligent dialogue agent. To measure progress towards that goal, we argue for the usefulness of a set of proxy tasks that evaluate reading comprehension via question answering. Our tasks measure understanding in several ways: whether a system is able to answer questions via chaining facts, simple induction, deduction and many more. The tasks are designed to be prerequisites for any system that aims to be capable of conversing with a human. We believe many existing learning systems can currently not solve them, and hence our aim is to classify these tasks into skill sets, so that researchers can identify (and then rectify) the failings of their systems. We also extend and improve the recently introduced Memory Networks model, and show it is able to solve some, but not all, of the tasks.
Homepage: https://github.com/facebookarchive/bAbI-tasks
### Citation
```
@article{weston2015towards,
title={Towards ai-complete question answering: A set of prerequisite toy tasks},
author={Weston, Jason and Bordes, Antoine and Chopra, Sumit and Rush, Alexander M and Van Merri{\"e}nboer, Bart and Joulin, Armand and Mikolov, Tomas},
journal={arXiv preprint arXiv:1502.05698},
year={2015}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet
#### Tasks
* `babi`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- greedy_until
task: babi
dataset_path: Muennighoff/babi
dataset_name: null
......
......@@ -52,9 +52,15 @@ Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french
}
```
### Subtasks
### Groups and Tasks
#### Groups
- `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset.
- `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
#### Tasks
The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset:
- `crows_pairs_english_age`
......@@ -68,8 +74,6 @@ The following tasks evaluate sub-areas of bias in the English CrowS-Pairs datase
- `crows_pairs_english_sexual_orientation`
- `crows_pairs_english_socioeconomic`
- `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset:
- `crows_pairs_french_age`
- `crows_pairs_french_autre`
......