Commit 4c139701 authored by haileyschoelkopf
Browse files

Merge branch 'big-refactor' into bump-deps

parents 5794ec3c cc547c7b
...@@ -63,10 +63,10 @@ jobs: ...@@ -63,10 +63,10 @@ jobs:
- name: Test with pytest - name: Test with pytest
# if new tasks are added, run tests on them # if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv -n=auto run: python -m pytest tests/test_tasks.py -s -vv
# if api is modified, run tests on it # if api is modified, run tests on it
- name: Test more tasks with pytest - name: Test more tasks with pytest
env: env:
API: true API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true' if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv -n=auto run: python -m pytest tests/test_tasks.py -s -vv
...@@ -674,22 +674,22 @@ class ConfigurableTask(Task): ...@@ -674,22 +674,22 @@ class ConfigurableTask(Task):
check_choices = test_choice check_choices = test_choice
else: else:
check_choices = [test_target] check_choices = [test_target]
if self.config.doc_to_choice is not None:
for choice in check_choices: for choice in check_choices:
choice_has_whitespace = True if " " in choice else False choice_has_whitespace = True if choice[0].isspace() else False
delimiter_has_whitespace = ( delimiter_has_whitespace = (
True if " " in self.config.target_delimiter else False True if self.config.target_delimiter[-1].isspace() else False
)
if delimiter_has_whitespace and choice_has_whitespace:
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" have whitespace'
)
elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
) )
if delimiter_has_whitespace and choice_has_whitespace:
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" have whitespace'
)
elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
)
def download(self, dataset_kwargs=None) -> None: def download(self, dataset_kwargs=None) -> None:
self.dataset = datasets.load_dataset( self.dataset = datasets.load_dataset(
path=self.DATASET_PATH, path=self.DATASET_PATH,
...@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task): ...@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task):
# it assumes that doc_to_target returns a number. # it assumes that doc_to_target returns a number.
choices = self.doc_to_choice(doc) choices = self.doc_to_choice(doc)
gold = choices[gold] gold = choices[gold]
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
else: else:
gold = str(gold) gold = str(gold)
...@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task): ...@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task):
# return true if any are true # return true if any are true
# TODO: this may break for multiple_target, non zero-or-1 metrics # TODO: this may break for multiple_target, non zero-or-1 metrics
scores = [] scores = []
if not isinstance(gold, list):
# sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
# print(gold)
gold = [gold]
for gold_option in gold: for gold_option in gold:
try: try:
result_score = self._metric_fn_list[metric]( result_score = self._metric_fn_list[metric](
......
# Task config for `nq_open`: open-ended question answering scored by exact match
# on greedily generated text.
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  # Generation stops at the first newline, period, or comma.
  until:
    - "\n"
    - "."
    - ","
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      # Strip a leading article together with its trailing space.
      # The backslash is doubled because `\b` inside a YAML double-quoted
      # scalar is the backspace escape, not a regex word boundary. The
      # alternatives are grouped so a bare `a` cannot match inside other words.
      # Both cases are listed explicitly because the ignore patterns may be
      # applied before case folding (TODO confirm against the metric
      # implementation).
      - "\\b(?:The |the |An |an |A |a )"
...@@ -38,13 +38,15 @@ def main(): ...@@ -38,13 +38,15 @@ def main():
iters = [] iters = []
for set in args.sets.split(","): for set in args.sets.split(","):
docs = None
if set == "train" and task.has_training_docs(): if set == "train" and task.has_training_docs():
docs = task.training_docs() docs = task.training_docs()
if set == "val" and task.has_validation_docs(): if set == "val" and task.has_validation_docs():
docs = task.validation_docs() docs = task.validation_docs()
if set == "test" and task.has_test_docs(): if set == "test" and task.has_test_docs():
docs = task.test_docs() docs = task.test_docs()
iters.append(docs) if docs is not None:
iters.append(docs)
docs = join_iters(iters) docs = join_iters(iters)
......
...@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks ...@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
# import lm_eval.models as models # import lm_eval.models as models
import lm_eval.api as api import lm_eval.api as api
import lm_eval.evaluator as evaluator import lm_eval.evaluator as evaluator
from typing import List
import random import random
import pytest import pytest
...@@ -26,7 +27,7 @@ import pytest ...@@ -26,7 +27,7 @@ import pytest
) )
], ],
) )
def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str): def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
task_name = task_name task_name = task_name
limit = 10 limit = 10
......
...@@ -9,6 +9,7 @@ import os ...@@ -9,6 +9,7 @@ import os
# This is the path where the output for the changed files for the tasks folder is stored # This is the path where the output for the changed files for the tasks folder is stored
# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt" # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
# reads a text file and returns a list of words # reads a text file and returns a list of words
# used to read the output of the changed txt from tj-actions/changed-files # used to read the output of the changed txt from tj-actions/changed-files
def load_changed_files(file_path: str) -> List[str]: def load_changed_files(file_path: str) -> List[str]:
...@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]: ...@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
return list(_output) return list(_output)
def new_tasks() -> Union[list[str], None]: def new_tasks() -> Union[List[str], None]:
FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt" FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
if os.path.exists(FILENAME): if os.path.exists(FILENAME):
# If tasks folder has changed then we get the list of files from FILENAME # If tasks folder has changed then we get the list of files from FILENAME
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment