Unverified commit fa2ae334 authored by Hailey Schoelkopf, committed by GitHub

Merge branch 'big-refactor' into bump-deps

parents 7c2687cb 54a53d6f
name: Tasks Modified
on:
  push:
    branches:
      - 'big-refactor*'
  pull_request:
    branches:
      - 'big-refactor*'
  workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true
      # The next step is optional; the files are written to the workspace by default (above),
      # so it's just for debugging.
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          # export the path of the changed-files report for later steps (the variable name is illustrative)
          echo "TASKS_FILE=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV"
echo "One or more test file(s) has changed." # echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" # echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- name: Set up Python 3.9 # - name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' # if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v4 # uses: actions/setup-python@v4
with: # with:
python-version: 3.9 # python-version: 3.9
cache: 'pip' # cache: 'pip'
cache-dependency-path: setup.py # cache-dependency-path: setup.py
- name: Install dependencies # - name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' # if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: | # run: |
python -m pip install --upgrade pip # python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu # pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies # # Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi # # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest # - name: Test with pytest
# if new tasks are added, run tests on them # # if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' # if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv # run: python -m pytest tests/test_tasks.py -s -vv
# if api is modified, run tests on it # # if api is modified, run tests on it
- name: Test more tasks with pytest # - name: Test more tasks with pytest
env: # env:
API: true # API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true' # if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv # run: python -m pytest tests/test_tasks.py -s -vv
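The `files_yaml` filters above are what gate the later steps: `tj-actions/changed-files` prefixes each filter name (`tasks`, `api`) onto its standard outputs, so the job can branch on `tasks_any_modified` and `api_any_modified`. As a rough illustration only (not the action's implementation), the sketch below reproduces the same kind of glob filtering in Python; the changed-file list is hypothetical, and the filter names mirror the config above.

```python
from fnmatch import fnmatch

# Filters mirroring the `files_yaml` block above.
FILTERS = {
    "tasks": ["lm_eval/tasks/**"],
    "api": ["lm_eval/api/**"],
}


def any_modified(changed_files, patterns):
    """Return True if any changed path matches one of the glob patterns."""
    # fnmatch treats '**' like '*', which is close enough for this illustration.
    return any(fnmatch(path, pat) for path in changed_files for pat in patterns)


# Hypothetical changed-file list for a pull request.
changed = ["lm_eval/tasks/mutual/README.md", "docs/task_guide.md"]

outputs = {f"{name}_any_modified": any_modified(changed, pats) for name, pats in FILTERS.items()}
print(outputs)  # {'tasks_any_modified': True, 'api_any_modified': False}
```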
@@ -40,39 +40,38 @@ jobs:
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # mypy turned off for now
      # - name: Lint with mypy
      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  # Job 2
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    timeout-minutes: 30
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip
          cache-dependency-path: setup.py
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
      - name: Archive artifacts
        uses: actions/upload-artifact@v3
        with:
          name: output_results
          path: |
            test_logs/*
import os
import yaml
from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
    TASK_REGISTRY,
    GROUP_REGISTRY,
    ALL_TASKS,
)


def include_benchmarks(task_dir: str) -> None:
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)

                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        config_list = [
                            task for task in all_task_list if type(task) != str
                        ]
                        task_list = [
                            task for task in all_task_list if type(task) == str
                        ]

                        for task_config in config_list:
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)

                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f"  {benchmark_path}\n"
                            "  Benchmark will not be added to registry\n"
                            f"  Error: {error}"
                        )


task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
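`include_benchmarks` only registers a YAML file that carries a `group` key and a `task` list mixing plain task-name strings (resolved against `ALL_TASKS` via `utils.pattern_match`) with inline task configs (passed through `check_prompt_config` and `register_configurable_task`). A minimal sketch of such a group config, with purely hypothetical group and task names, is shown below as the dict `yaml.full_load()` would return:

```python
# Hypothetical benchmark/group config, shown as the dict yaml.full_load() would produce.
# Group, task, and dataset names are illustrative, not real configs from the repository.
example_group_config = {
    "group": "my_benchmark",  # required: the group name to register
    "task": [
        "mutual",        # plain string: matched against ALL_TASKS by pattern matching
        "mutual_plus",
        {                # inline dict: registered as a configurable task within the group
            "task": "my_extra_task",
            "dataset_path": "some/dataset",
        },
    ],
}
```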
@@ -11,7 +11,6 @@ import numpy as np

import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
...
@@ -16,7 +16,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
@@ -36,7 +36,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
...
@@ -38,6 +38,34 @@ def register_configurable_task(config: Dict[str, str]) -> int:
    return 0


def register_configurable_group(config: Dict[str, str]) -> int:
    group = config["group"]
    all_task_list = config["task"]
    config_list = [task for task in all_task_list if type(task) != str]
    task_list = [task for task in all_task_list if type(task) == str]

    for task_config in config_list:
        var_configs = check_prompt_config(
            {
                **task_config,
                **{"group": group},
            }
        )
        for config in var_configs:
            register_configurable_task(config)

    task_names = utils.pattern_match(task_list, ALL_TASKS)
    for task in task_names:
        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
            if group in GROUP_REGISTRY:
                GROUP_REGISTRY[group].append(task)
            else:
                GROUP_REGISTRY[group] = [task]
            ALL_TASKS.add(group)
    return 0


def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
    all_configs = []
    if "use_prompt" in config:
@@ -76,7 +104,7 @@ def get_task_name_from_config(task_config: Dict[str, str]) -> str:
    return "{dataset_path}".format(**task_config)


def include_task_folder(task_dir: str, register_task=True) -> None:
    """
    Calling this function
    """
@@ -87,9 +115,16 @@ def include_task_folder(task_dir: str) -> None:
                    yaml_path = os.path.join(root, f)
                    try:
                        config = utils.load_yaml_config(yaml_path)

                        if register_task:
                            all_configs = check_prompt_config(config)
                            for config in all_configs:
                                register_configurable_task(config)
                        else:
                            # If a `task` in config is a list,
                            # that means it's a benchmark
                            if type(config["task"]) == list:
                                register_configurable_group(config)

                    except Exception as error:
                        eval_logger.warning(
@@ -102,6 +137,8 @@ def include_task_folder(task_dir: str) -> None:
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)


def get_task(task_name, config):
...
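The point of the second `include_task_folder(task_dir, register_task=False)` pass is ordering: group configs are only registered after every individual task is already in the registry, so their member names can be pattern-matched. A rough sanity check of that invariant, with a hypothetical group name, might look like the sketch below (registry names follow the imports shown earlier in this commit):

```python
from lm_eval.api.registry import ALL_TASKS, GROUP_REGISTRY, TASK_REGISTRY
import lm_eval.tasks  # importing the package runs both registration passes at module load

group_name = "my_benchmark"  # hypothetical; substitute a group actually defined under lm_eval/tasks/

if group_name in GROUP_REGISTRY:
    # members recorded for a group are themselves registered tasks (or nested groups)
    assert all(t in TASK_REGISTRY or t in GROUP_REGISTRY for t in GROUP_REGISTRY[group_name])
    assert group_name in ALL_TASKS
```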
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[16+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else
%}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
include: cot_yaml
task: mgsm_bn_direct
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[28+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{%
else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
include: cot_yaml
task: mgsm_de_direct
# Generated by utils.py
dataset_name: en
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else
%}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_en_direct
# Generated by utils.py
dataset_name: es
doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{%
else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
include: cot_yaml
task: mgsm_es_direct
# Generated by utils.py
dataset_name: fr
doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{%
else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
include: cot_yaml
task: mgsm_fr_direct
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題:
"+question+"\nステップごとの答え:"}}{% endif %}'
include: cot_yaml
task: mgsm_ja_direct
# Generated by utils.py
dataset_name: ru
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else
%}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
include: cot_yaml
task: mgsm_ru_direct
# Generated by utils.py
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
include: cot_yaml
task: mgsm_sw_direct
# Generated by utils.py
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
include: cot_yaml
task: mgsm_te_direct
# Generated by utils.py
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
include: cot_yaml
task: mgsm_th_direct
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
include: cot_yaml
task: mgsm_zh_direct
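Each of these generated configs wraps the same Jinja template around a language-specific question/answer prefix (with a matching character offset in `doc_to_target`). As a rough illustration of how the English `doc_to_text` template behaves, assuming plain Jinja2 rendering (the harness's own templating layer may differ in detail), the sample docs below are hypothetical:

```python
from jinja2 import Template

# doc_to_text for mgsm_en_direct, copied from the config above.
doc_to_text = (
    r'{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}'
    r'{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
)

few_shot_doc = {"question": "Roger has 5 balls ...", "answer": "Step-by-Step Answer: ... The answer is 5."}
test_doc = {"question": "Janet has 3 apples ...", "answer": None}

# answer present: renders the question followed by the answer prefix on a new line
print(Template(doc_to_text).render(**few_shot_doc))
# answer missing: renders "Question: <question>" followed by the answer prefix
print(Template(doc_to_text).render(**test_doc))
```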
# MuTual
### Paper
Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`
Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
### Citation
```
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `mutual`
* `mutual_plus`
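As a usage sketch only (the exact entry point and model arguments depend on the state of the big-refactor branch, and `pretrained=gpt2` is merely a placeholder), running these tasks from Python might look roughly like:

```python
from lm_eval import evaluator

# Placeholder backend and checkpoint; adjust to whatever you actually use.
results = evaluator.simple_evaluate(
    model="hf",                      # assumed HuggingFace backend name on this branch
    model_args="pretrained=gpt2",
    tasks=["mutual", "mutual_plus"],
    num_fewshot=0,
)
print(results["results"])
```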
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?