Commit b7cd829b authored by lintangsutawika


Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into benchmark-scripts
parents 2d96a8c8 4e44f0aa
name: Tasks Modified

on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
# comment/edit out the above to stop/change the triggers

jobs:
  changed_files:
    runs-on: ubuntu-latest  # windows-latest || macos-latest
    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # OR "2" -> To retrieve the preceding commit.

      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true

      # The next step is optional; the files are written to the workspace by default (above),
      # so it is only for debugging.
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          # export the path of the changed-files list for later steps
          # (the TASKS_CHANGED_FILES_LIST variable name is illustrative)
          echo "TASKS_CHANGED_FILES_LIST=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV"
          echo "One or more test file(s) have changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        # if new tasks are added, run tests on them
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
      # if the api is modified, run the task tests as well
      - name: Test more tasks with pytest
        env:
          API: true
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
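Because `write_output_files: true` leaves the per-filter file lists under `.github/outputs/` in the workspace, the pytest step above can discover which task configs changed by reading that file. The sketch below is purely illustrative: it is not the repository's actual `tests/extra/test_new_tasks.py`, and the whitespace-separated file format is an assumption based on the `tj-actions/changed-files` defaults.

```python
from pathlib import Path

# Path written by tj-actions/changed-files when `write_output_files: true` is set
# (file name taken from the workflow above; format assumed).
CHANGED_LIST = Path(".github/outputs/tasks_all_changed_and_modified_files.txt")


def changed_task_files() -> list[str]:
    """Return changed files under lm_eval/tasks/, or [] when run outside CI."""
    if not CHANGED_LIST.exists():
        return []
    return [p for p in CHANGED_LIST.read_text().split() if p.endswith((".yaml", ".py"))]
```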
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Build

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Cache
        uses: actions/cache@v2.1.3
        with:
          # A list of files, directories, and wildcard patterns to cache and restore
          path: |
            ~/.cache
          # An explicit key for restoring and saving the cache
          key: evaldata-cache-4
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pytest-cov
          pip install -e .[dev,multilingual]
          # Install optional git dependencies
          pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest -vv --cov=lm_eval/ tests/
      - name: Upload to codecov
        run: |
          bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# just comment out unwanted steps to turn off the test.

name: Unit Tests

on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:

# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      - name: Lint with pylint
        run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # mypy turned off for now
      # - name: Lint with mypy
      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable

  # Job 2
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
@@ -4,6 +4,15 @@ output/
data/
lm_cache
.idea
*.egg-info/
build
dist
*.egg-info
venv
.vscode/
temp
__pycache__
.ipynb_checkpoints
temp
# IPython
profile_default/
ipython_config.py
@@ -472,6 +472,9 @@ class Task(abc.ABC):
            return labeled_examples + example
        elif type(example) == list:
            return [labeled_examples + ex for ex in example]
        elif type(example) == int:
            choices = self.doc_to_choice(doc)
            return labeled_examples + choices[example]

    def apply_filters(self):
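To make the new `elif type(example) == int` branch above concrete: when the target returned for a doc is an index rather than a string, the few-shot context is completed with the corresponding entry of `doc_to_choice(doc)`. A standalone sketch (the values and names below are made up, and stand in for the surrounding method's local variables):

```python
# `labeled_examples` is the already-formatted few-shot block, `example` is the
# value doc_to_target returned for the evaluated doc, `choices` is doc_to_choice(doc).
labeled_examples = "Question: 2 + 2?\nAnswer: 4\n\n"
choices = ["Paris", "London", "Rome"]
example = 1  # an int target: index into the choice list

print(labeled_examples + choices[example])  # few-shot block followed by "London"
```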
@@ -950,22 +953,21 @@ class ConfigurableTask(Task):
            if self.multiple_target:
                acc = 1.0 if pred in gold else 0.0
                acc_norm = 1.0 if pred_norm in gold else 0.0
                exact_match = int(any([is_greedy[i] for i in gold]))
            else:
                acc = 1.0 if pred == gold else 0.0
                acc_norm = 1.0 if pred_norm == gold else 0.0
                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
                exact_match = int(is_greedy[gold])

            result_dict = {
                **({"acc": acc} if "acc" in use_metric else {}),
                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
            }

            if "exact_match" in self._metric_fn_list.keys():
                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
                is_greedy = is_greedy[gold]  # take value for the gold answer
                result_dict["exact_match"] = int(is_greedy)

            if "acc_mutual_info" in use_metric:
                lls_mutual_info = [
                    ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
......
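The multiple-target branch above scores `exact_match` as "at least one accepted answer was the model's greedy continuation". A tiny self-contained illustration (values are made up; in the harness `is_greedy` holds one boolean per choice and `gold` the indices of all accepted answers):

```python
# One boolean per answer choice: was that choice exactly the greedy continuation?
is_greedy = [False, True, False, False]
# Indices of all accepted answers (e.g. several webqs aliases for the same entity).
gold = [1, 3]

exact_match = int(any(is_greedy[i] for i in gold))
assert exact_match == 1  # choice 1 was generated greedily, so the doc counts as a match
```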
@@ -244,10 +244,7 @@ def evaluate(
        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if (
                task.OUTPUT_TYPE == "multiple_choice"
                or task.OUTPUT_TYPE == "winograd_schema"
            )
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)
......
@@ -6,7 +6,7 @@ from lm_eval.api.registry import register_model
@register_model("dummy")
class DummyLM(LM):
    def __init__(self):
        pass
        super().__init__()

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
......
@@ -723,7 +723,7 @@ class HFLM(LM):
        else:
            max_gen_toks = self.max_gen_toks

        # first stop sequence is used to halt generation upon encountering
        (primary_until) = until[0]
        primary_until = [until[0]]

        # set the max length in tokens of inputs ("context_enc")
        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
......
# WebQuestions
### Paper
Title: `Semantic Parsing on Freebase from Question-Answer Pairs`
Abstract: `https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf`
WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs, all intended to be answerable using Freebase, a large knowledge
graph. The questions are mostly centered around a single named entity and reflect
popular queries asked on the web (at least as of 2013).
Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a`
### Citation
```
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `webqs`: `Questions with multiple accepted answers.`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
from typing import Dict, List


def doc_to_choice(doc: Dict) -> List[str]:
    """Return all of the accepted answers as choices."""
    return _remove_prefixes(doc["answers"])


def doc_to_target(doc: Dict) -> List[int]:
    """Return list of indices of accepted answers (all of them)."""
    remaining = _remove_prefixes(doc["answers"])
    return list(range(len(remaining)))


def _remove_prefixes(aliases):
    """
    Remove any alias that has a strict prefix elsewhere in the list.
    This is an optimization: if a prefix is accepted by the is_greedy (exact-match)
    check, we can stop looking, so longer aliases that extend it are redundant.
    """
    aliases.sort()
    ret = [aliases[0]]
    for alias in aliases[1:]:
        if not alias.startswith(ret[-1]):
            ret.append(alias)
    return ret
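A small usage sketch of the helpers above (the doc is made up; in practice the fields come from the `web_questions` dataset, and the functions are assumed to be in scope, e.g. by running this inside the task's `utils.py`):

```python
# Illustrative WebQuestions-style doc whose gold answers are aliases of one entity.
doc = {
    "question": "what is the capital of france?",
    "answers": ["Paris, France", "Paris"],
}

print(doc_to_choice(doc))  # -> ["Paris"]; "Paris, France" is dropped since "Paris" is a prefix of it
print(doc_to_target(doc))  # -> [0]; every surviving alias is an accepted target index
```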
group:
  - freebase
  - question_answer
task: webqs
dataset_path: web_questions
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: null
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
# XWinograd
### Paper
Title: `It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning`
Abstract: `https://arxiv.org/abs/2106.12066`
A multilingual Winograd schema challenge covering English, French, Japanese, Portuguese, Russian and Chinese. The schemas come from the XWinograd dataset introduced in Tikhonov et al. (2021). As that dataset only contains 16 Chinese schemas, 488 additional Chinese schemas from clue/cluewsc2020 are included.
Homepage: `https://huggingface.co/datasets/Muennighoff/xwinograd`
### Citation
```
@misc{muennighoff2022crosslingual,
title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
year={2022},
eprint={2211.01786},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{tikhonov2021heads,
title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
author={Alexey Tikhonov and Max Ryabinin},
year={2021},
eprint={2106.12066},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `xwinograd_en`: Winograd schema challenges in English.
* `xwinograd_fr`: Winograd schema challenges in French.
* `xwinograd_jp`: Winograd schema challenges in Japanese.
* `xwinograd_pt`: Winograd schema challenges in Portuguese.
* `xwinograd_ru`: Winograd schema challenges in Russian.
* `xwinograd_zh`: Winograd schema challenges in Chinese.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import argparse
from typing import Dict, List

import yaml

# Different languages that are part of xwinograd.
# These correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
LANGUAGES = ["en", "fr", "jp", "pt", "ru", "zh"]


def doc_to_text(doc: Dict) -> int:
    """
    Return index of the correct choice.

    Note: We are using the "multiple input" mode of the multiple-choice
    output-type, which means we use different contexts with the same target
    for the different choices, rather than the same context and different targets.
    """
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]


def doc_to_target(doc: Dict) -> str:
    """
    Return the target completion.

    Note that this does not depend on the correct choice as we are using
    "multiple input" mode.
    """
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()


def doc_to_choice(doc: Dict) -> List[str]:
    """Return the choices that will be used as contexts in "multiple input" mode."""
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]
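A worked illustration of the "multiple input" wiring described in the docstrings above (the doc is made up but uses the fields of the Muennighoff/xwinograd schema; the three functions above are assumed to be in scope):

```python
doc = {
    "sentence": "The trophy doesn't fit in the suitcase because _ is too big.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}

print(doc_to_text(doc))    # -> 0, index of the correct context (option1)
print(doc_to_choice(doc))  # -> two contexts, one per option:
#   ["The trophy doesn't fit in the suitcase because the trophy",
#    "The trophy doesn't fit in the suitcase because the suitcase"]
print(doc_to_target(doc))  # -> "is too big.", the shared continuation scored against each context
```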
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml file for each language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    """
    err = []
    for lang in LANGUAGES:
        file_name = f"xwinograd_{lang}.yaml"
        try:
            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        "include": "xwinograd_common_yaml",
                        "dataset_name": lang,
                        "task": f"xwinograd_{lang}",
                    },
                    f,
                )
        except FileExistsError:
            err.append(file_name)

    if len(err) > 0:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )


def main() -> None:
    """Parse CLI args and generate language-specific yaml files."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    args = parser.parse_args()

    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)


if __name__ == "__main__":
    main()
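Running `python utils.py` from this folder regenerates the six per-language configs shown further below; pass `--overwrite` to replace existing files and `--output-dir` to write them elsewhere.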
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
  - winograd
  - commonsense
  - multilingual
dataset_path: Muennighoff/xwinograd
dataset_name: null  # Overridden by language-specific config.
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
# Generated by utils.py
dataset_name: en
include: xwinograd_common_yaml
task: xwinograd_en
# Generated by utils.py
dataset_name: fr
include: xwinograd_common_yaml
task: xwinograd_fr
# Generated by utils.py
dataset_name: jp
include: xwinograd_common_yaml
task: xwinograd_jp
# Generated by utils.py
dataset_name: pt
include: xwinograd_common_yaml
task: xwinograd_pt
# Generated by utils.py
dataset_name: ru
include: xwinograd_common_yaml
task: xwinograd_ru
# Generated by utils.py
dataset_name: zh
include: xwinograd_common_yaml
task: xwinograd_zh