Unverified commit fa2ae334 authored by Hailey Schoelkopf, committed by GitHub

Merge branch 'big-refactor' into bump-deps

parents 7c2687cb 54a53d6f
name: Tasks Modified
on:
  push:
    branches:
      - 'big-refactor*'
  pull_request:
    branches:
      - 'big-refactor*'
  workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true
      # The next step is optional; the files are written to the workspace by default (above),
      # so it's just for debugging.
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          # export the path of the changed-files report for later steps (the variable name is illustrative)
          echo "TASKS_FILE=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV"
echo "One or more test file(s) has changed." # echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" # echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- name: Set up Python 3.9 # - name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' # if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v4 # uses: actions/setup-python@v4
with: # with:
python-version: 3.9 # python-version: 3.9
cache: 'pip' # cache: 'pip'
cache-dependency-path: setup.py # cache-dependency-path: setup.py
- name: Install dependencies # - name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' # if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: | # run: |
python -m pip install --upgrade pip # python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu # pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies # # Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi # # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest # - name: Test with pytest
# if new tasks are added, run tests on them # # if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' # if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv # run: python -m pytest tests/test_tasks.py -s -vv
# if api is modified, run tests on it # # if api is modified, run tests on it
- name: Test more tasks with pytest # - name: Test more tasks with pytest
env: # env:
API: true # API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true' # if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv # run: python -m pytest tests/test_tasks.py -s -vv
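The `files_yaml` filters above are what gate the later steps: `tj-actions/changed-files` prefixes each filter name (`tasks`, `api`) onto its standard outputs, so the job can branch on `tasks_any_modified` and `api_any_modified`. As a rough illustration only (not the action's implementation), the sketch below reproduces the same kind of glob filtering in Python; the changed-file list is hypothetical, and the filter names mirror the config above.

```python
from fnmatch import fnmatch

# Filters mirroring the `files_yaml` block above.
FILTERS = {
    "tasks": ["lm_eval/tasks/**"],
    "api": ["lm_eval/api/**"],
}


def any_modified(changed_files, patterns):
    """Return True if any changed path matches one of the glob patterns."""
    # fnmatch treats '**' like '*', which is close enough for this illustration.
    return any(fnmatch(path, pat) for path in changed_files for pat in patterns)


# Hypothetical changed-file list for a pull request.
changed = ["lm_eval/tasks/mutual/README.md", "docs/task_guide.md"]

outputs = {f"{name}_any_modified": any_modified(changed, pats) for name, pats in FILTERS.items()}
print(outputs)  # {'tasks_any_modified': True, 'api_any_modified': False}
```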
@@ -40,39 +40,38 @@ jobs:
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # mypy turned off for now
      # - name: Lint with mypy
      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  # Job 2
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    timeout-minutes: 30
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip
          cache-dependency-path: setup.py
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
      - name: Archive artifacts
        uses: actions/upload-artifact@v3
        with:
          name: output_results
          path: |
            test_logs/*
import os
import yaml
from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
    TASK_REGISTRY,
    GROUP_REGISTRY,
    ALL_TASKS,
)


def include_benchmarks(task_dir: str) -> None:
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)

                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        config_list = [
                            task for task in all_task_list if type(task) != str
                        ]
                        task_list = [
                            task for task in all_task_list if type(task) == str
                        ]

                        for task_config in config_list:
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)

                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f"  {benchmark_path}\n"
                            "  Benchmark will not be added to registry\n"
                            f"  Error: {error}"
                        )


task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
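`include_benchmarks` only registers a YAML file that carries a `group` key and a `task` list mixing plain task-name strings (resolved against `ALL_TASKS` via `utils.pattern_match`) with inline task configs (passed through `check_prompt_config` and `register_configurable_task`). A minimal sketch of such a group config, with purely hypothetical group and task names, is shown below as the dict `yaml.full_load()` would return:

```python
# Hypothetical benchmark/group config, shown as the dict yaml.full_load() would produce.
# Group, task, and dataset names are illustrative, not real configs from the repository.
example_group_config = {
    "group": "my_benchmark",  # required: the group name to register
    "task": [
        "mutual",        # plain string: matched against ALL_TASKS by pattern matching
        "mutual_plus",
        {                # inline dict: registered as a configurable task within the group
            "task": "my_extra_task",
            "dataset_path": "some/dataset",
        },
    ],
}
```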
@@ -11,7 +11,6 @@ import numpy as np

import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
...
@@ -16,7 +16,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
@@ -36,7 +36,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
...
@@ -38,6 +38,34 @@ def register_configurable_task(config: Dict[str, str]) -> int:
    return 0


def register_configurable_group(config: Dict[str, str]) -> int:
    group = config["group"]
    all_task_list = config["task"]
    config_list = [task for task in all_task_list if type(task) != str]
    task_list = [task for task in all_task_list if type(task) == str]

    for task_config in config_list:
        var_configs = check_prompt_config(
            {
                **task_config,
                **{"group": group},
            }
        )
        for config in var_configs:
            register_configurable_task(config)

    task_names = utils.pattern_match(task_list, ALL_TASKS)
    for task in task_names:
        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
            if group in GROUP_REGISTRY:
                GROUP_REGISTRY[group].append(task)
            else:
                GROUP_REGISTRY[group] = [task]
            ALL_TASKS.add(group)
    return 0


def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
    all_configs = []
    if "use_prompt" in config:
@@ -76,7 +104,7 @@ def get_task_name_from_config(task_config: Dict[str, str]) -> str:
    return "{dataset_path}".format(**task_config)


def include_task_folder(task_dir: str, register_task=True) -> None:
    """
    Calling this function
    """
@@ -87,9 +115,16 @@ def include_task_folder(task_dir: str) -> None:
                    yaml_path = os.path.join(root, f)
                    try:
                        config = utils.load_yaml_config(yaml_path)

                        if register_task:
                            all_configs = check_prompt_config(config)
                            for config in all_configs:
                                register_configurable_task(config)
                        else:
                            # If a `task` in config is a list,
                            # that means it's a benchmark
                            if type(config["task"]) == list:
                                register_configurable_group(config)

                    except Exception as error:
                        eval_logger.warning(
@@ -102,6 +137,8 @@ def include_task_folder(task_dir: str) -> None:
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)


def get_task(task_name, config):
...
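The point of the second `include_task_folder(task_dir, register_task=False)` pass is ordering: group configs are only registered after every individual task is already in the registry, so their member names can be pattern-matched. A rough sanity check of that invariant, with a hypothetical group name, might look like the sketch below (registry names follow the imports shown earlier in this commit):

```python
from lm_eval.api.registry import ALL_TASKS, GROUP_REGISTRY, TASK_REGISTRY
import lm_eval.tasks  # importing the package runs both registration passes at module load

group_name = "my_benchmark"  # hypothetical; substitute a group actually defined under lm_eval/tasks/

if group_name in GROUP_REGISTRY:
    # members recorded for a group are themselves registered tasks (or nested groups)
    assert all(t in TASK_REGISTRY or t in GROUP_REGISTRY for t in GROUP_REGISTRY[group_name])
    assert group_name in ALL_TASKS
```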
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[16+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else
%}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
include: cot_yaml
task: mgsm_bn_direct
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[28+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{%
else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
include: cot_yaml
task: mgsm_de_direct
# Generated by utils.py
dataset_name: en
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else
%}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_en_direct
# Generated by utils.py
dataset_name: es
doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{%
else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
include: cot_yaml
task: mgsm_es_direct
# Generated by utils.py
dataset_name: fr
doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{%
else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
include: cot_yaml
task: mgsm_fr_direct
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題:
"+question+"\nステップごとの答え:"}}{% endif %}'
include: cot_yaml
task: mgsm_ja_direct
# Generated by utils.py
dataset_name: ru
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else
%}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
include: cot_yaml
task: mgsm_ru_direct
# Generated by utils.py
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
include: cot_yaml
task: mgsm_sw_direct
# Generated by utils.py
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
include: cot_yaml
task: mgsm_te_direct
# Generated by utils.py
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
include: cot_yaml
task: mgsm_th_direct
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
include: cot_yaml
task: mgsm_zh_direct
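Each of these generated configs wraps the same Jinja template around a language-specific question/answer prefix (with a matching character offset in `doc_to_target`). As a rough illustration of how the English `doc_to_text` template behaves, assuming plain Jinja2 rendering (the harness's own templating layer may differ in detail), the sample docs below are hypothetical:

```python
from jinja2 import Template

# doc_to_text for mgsm_en_direct, copied from the config above.
doc_to_text = (
    r'{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}'
    r'{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
)

few_shot_doc = {"question": "Roger has 5 balls ...", "answer": "Step-by-Step Answer: ... The answer is 5."}
test_doc = {"question": "Janet has 3 apples ...", "answer": None}

# answer present: renders the question followed by the answer prefix on a new line
print(Template(doc_to_text).render(**few_shot_doc))
# answer missing: renders "Question: <question>" followed by the answer prefix
print(Template(doc_to_text).render(**test_doc))
```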
# MuTual
### Paper
Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`
Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
### Citation
```
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `mutual`
* `mutual_plus`
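As a usage sketch only (the exact entry point and model arguments depend on the state of the big-refactor branch, and `pretrained=gpt2` is merely a placeholder), running these tasks from Python might look roughly like:

```python
from lm_eval import evaluator

# Placeholder backend and checkpoint; adjust to whatever you actually use.
results = evaluator.simple_evaluate(
    model="hf",                      # assumed HuggingFace backend name on this branch
    model_args="pretrained=gpt2",
    tasks=["mutual", "mutual_plus"],
    num_fewshot=0,
)
print(results["results"])
```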
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?