Unverified Commit 2387f39d authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into flan-benchmark

parents 7601d828 784fe037
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Romanian phrase: {{translation["ro"]}}

  English phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-ro-en
output_type: greedy_until
training_split: train
validation_split: validation
fewshot_split: validation
test_split: test
metric_list:
- metric: bleu
- metric: ter
- metric: chrf
generation_kwargs:
  until:
    - "\n"
  do_sample: false
  temperature: 0.0
repeats: 1
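As a quick illustration of what this config produces: the harness renders `doc_to_text` and `doc_to_target` as Jinja2 templates against each dataset row. A minimal sketch, with a made-up translation pair (real rows come from the wmt16 `ro-en` split):

```
from jinja2 import Template

doc = {"translation": {"ro": "Bună dimineața!", "en": "Good morning!"}}

prompt = Template('Romanian phrase: {{translation["ro"]}}\nEnglish phrase:')
target = Template(' {{translation["en"]}}')

print(prompt.render(**doc))  # Romanian phrase: Bună dimineața!
                             # English phrase:
print(target.render(**doc))  #  Good morning!   (note the leading space)
```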
@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
     string = string.replace(" 's", "'s")

     return string
+
+
+def process_results(doc, results):
+    (loglikelihood,) = results
+    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
+    _words = len(re.split(r"\s+", doc["page"]))
+    _bytes = len(doc["page"].encode("utf-8"))
+    return {
+        "word_perplexity": (loglikelihood, _words),
+        "byte_perplexity": (loglikelihood, _bytes),
+        "bits_per_byte": (loglikelihood, _bytes),
+    }
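These tuples only become scores at aggregation time: the harness sums log-likelihoods and unit counts across all documents before exponentiating. A hedged sketch of that aggregation, assuming the standard weighted-perplexity definitions (the harness's own metric functions may differ in detail):

```
import math

def weighted_perplexity(items):
    # items: list of (loglikelihood, num_units) pairs, units = words or bytes
    total_ll = sum(ll for ll, _ in items)
    total_units = sum(n for _, n in items)
    return math.exp(-total_ll / total_units)

def bits_per_byte(items):
    # same pairs, but normalized per byte and converted to base 2
    total_ll = sum(ll for ll, _ in items)
    total_bytes = sum(n for _, n in items)
    return -total_ll / (total_bytes * math.log(2))
```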
@@ -7,6 +7,7 @@ validation_split: validation
 test_split: test
 doc_to_text: ""
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
+process_results: !function preprocess_wikitext.process_results
 should_decontaminate: true
 doc_to_decontamination_query: "{{page}}"
 metric_list:
...
# WSC273
### Paper
Title: `The Winograd Schema Challenge`
Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in "A Simple Method for Commonsense Reasoning" (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
### Citation
```
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
```
### Groups and Tasks
#### Groups
* Not part of any group yet.
#### Tasks
* `wsc273`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: wsc273
dataset_path: winograd_wsc
dataset_name: wsc273
output_type: multiple_choice
test_split: test
doc_to_text: label
process_docs: !function utils.process_doc
doc_to_target: "{% set index = pronoun_loc + pronoun | length %}{{text[index:]}}"
doc_to_choice: "{% set template = text[:pronoun_loc] %}{{[template+options[0], template+options[1]]}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
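A sketch of what the two templates above compute, on a toy document (not from the dataset): both choices share the continuation after the pronoun, so the model is scored on the likelihood of the same suffix under each substituted prefix — the `partial evaluation` the README describes.

```
doc = {
    "text": "The trophy doesn't fit in the suitcase because it is too big.",
    "pronoun": "it",
    "pronoun_loc": 47,
    "options": ["the trophy", "the suitcase"],
}

prefix = doc["text"][: doc["pronoun_loc"]]                        # text up to the pronoun
suffix = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]):]   # shared target: " is too big."
choices = [prefix + opt for opt in doc["options"]]

print(choices[0] + suffix)  # ...because the trophy is too big.
print(choices[1] + suffix)  # ...because the suitcase is too big.
```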
upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]


def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)


def __normalize_option(doc, option):
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"
    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())
    return option
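And a quick sketch of `__normalize_option` on an invented document: a possessive pronoun triggers the `'s` suffix, while proper-noun options keep their capitalization, since only the determiners and pronouns in `upper_pronouns` get lowercased.

```
# Invented example; real docs come from the winograd_wsc `wsc273` split.
doc = {
    "text": "Jane gave Joan candy because her hunger was obvious.",
    "pronoun": "her",
    "pronoun_loc": 29,
    "options": ["Jane", "Joan"],
}
print(__normalize_option(doc, "Jane"))  # -> "Jane's"
print(__normalize_option(doc, "Joan"))  # -> "Joan's"
```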
@@ -10,7 +10,7 @@ import collections
 import importlib.util
 import fnmatch

-from typing import List, Literal, Union
+from typing import Iterator, List, Literal, Union

 import gc
 import torch
@@ -65,7 +65,7 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n=0, fn=None):
+def chunks(iter, n: int = 0, fn=None):
     arr = []
     for i, x in enumerate(iter):
         arr.append(x)
@@ -87,11 +87,11 @@ def group(arr, fn):
 class MultiChoice:
-    def __init__(self, choices):
+    def __init__(self, choices) -> None:
         self.choices = choices

     # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
+    def __contains__(self, values) -> bool:
         for value in values.split(","):
             if len(fnmatch.filter(self.choices, value)) == 0:
                 eval_logger.info(f"Available tasks to choose:")
@@ -100,7 +100,7 @@ class MultiChoice:
                 raise ValueError("'{}' is not in task list".format(value))
         return True

-    def __iter__(self):
+    def __iter__(self) -> Iterator:
         for choice in self.choices:
             yield choice
@@ -108,7 +108,6 @@ class MultiChoice:
 # Returns a list containing all values of the source_list that
 # match at least one of the patterns
 def pattern_match(patterns, source_list):
     if type(patterns) == str:
         patterns = [patterns]
@@ -177,7 +176,7 @@ def make_disjoint_window(pair):
 class Reorderer:
-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         self.size = len(arr)
         arr = list(enumerate(arr))
         arr = group(arr, lambda x: fn(x[1]))
@@ -212,7 +211,7 @@ class Grouper:
     objects in `arr` satisfying `key == fn(ob)`.
     """

-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         # self.orig_arr = arr
         self.size = len(arr)
         arr = list(enumerate(arr))
@@ -263,7 +262,7 @@ class Grouper:
         return res


-def make_table(result_dict, column="results"):
+def make_table(result_dict, column: str = "results"):
     """Generate table of results."""
     from pytablewriter import MarkdownTableWriter, LatexTableWriter
@@ -393,7 +392,6 @@ def get_git_commit_hash():
 def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
@@ -451,7 +449,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
     return yaml_config


-def regex_replace(string, pattern, repl, count=0):
+def regex_replace(string, pattern, repl, count: int = 0):
     """Implements the `re.sub` function as a custom Jinja filter."""
     return re.sub(pattern, repl, string, count=count)
@@ -525,7 +523,7 @@ def pad_and_concat(
     return torch.cat(tensors, dim=0)


-def clear_torch_cache():
+def clear_torch_cache() -> None:
     gc.collect()
     torch.cuda.empty_cache()
@@ -550,7 +548,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         tokenizer: transformers.PreTrainedTokenizer,
         initial_decoder_input_length: int,
         batch_size: int,
-    ):
+    ) -> None:
         self.initial_decoder_input_length = initial_decoder_input_length
         self.done_tracker = [False] * batch_size
         self.sequence = sequence
...
@@ -9,24 +9,26 @@ from pathlib import Path

 from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
 from lm_eval.benchmarks import include_benchmarks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"


-def parse_args():
-    parser = argparse.ArgumentParser()
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--tasks",
+        default=None,
+        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
+    )
     parser.add_argument(
         "--model_args",
         default="",
         help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
     )
-    parser.add_argument(
-        "--tasks", default=None  # , choices=utils.MultiChoice(sorted(ALL_TASKS))
-    )
     parser.add_argument(
         "--num_fewshot",
         type=int,
@@ -99,7 +101,7 @@ def parse_args():
     return parser.parse_args()


-def main():
+def main() -> None:
     args = parse_args()

     if args.limit:
@@ -126,10 +128,21 @@ def main():
         else:
             tasks_list = args.tasks.split(",")
             task_names = utils.pattern_match(tasks_list, ALL_TASKS)
+            task_missing = []
             for task in [task for task in tasks_list if task not in task_names]:
                 if os.path.isfile(task):
                     config = utils.load_yaml_config(task)
                     task_names.append(config)
+                else:
+                    task_missing.append(task)
+
+            if task_missing != []:
+                missing = ", ".join(task_missing)
+                eval_logger.error(
+                    f"Tasks were not found: {missing}\n"
+                    f"{SPACING}Try `lm-eval -h` for list of available tasks",
+                )
+                raise ValueError(f"Tasks {missing} were not found.")

     if args.output_path:
         path = Path(args.output_path)
...
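For readers unfamiliar with the `utils.pattern_match` call above: it expands shell-style wildcards in `--tasks` against the task registry via `fnmatch`. A self-contained sketch of that behavior (task names here are illustrative; the real implementation lives in `lm_eval/utils.py`):

```
import fnmatch

ALL_TASKS = ["wmt16-ro-en", "wmt16-de-en", "wsc273", "wikitext"]

def pattern_match(patterns, source_list):
    if isinstance(patterns, str):
        patterns = [patterns]
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return sorted(task_names)

print(pattern_match(["wmt16*"], ALL_TASKS))  # ['wmt16-de-en', 'wmt16-ro-en']
```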
[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True
# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True
[mypy-lm_eval.api.*]
ignore_errors = True
[mypy-lm_eval.prompts.*]
ignore_errors = True
[mypy-lm_eval.models.*]
ignore_errors = True
[mypy-scripts.*]
ignore_errors = True
[mypy-main]
ignore_errors = True
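A small illustration of what `check_untyped_defs = True` changes: by default mypy skips the bodies of unannotated functions, but with this flag it still checks them, which is what lets the per-package `ignore_errors` overrides above be peeled away gradually. A hypothetical module, not from the repo:

```
# hypothetical example
def greet(name):           # unannotated signature: body normally unchecked
    count: int = len(name)
    return count + "!"     # flagged only when check_untyped_defs = True:
                           # unsupported operand types for + ("int" and "str")
```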
@@ -15,7 +15,7 @@ extras_require = {
     ],
     "testing": ["pytest", "pytest-cov", "pytest-xdist"],
     "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
+    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
     "promptsource": [
         "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
     ],
@@ -53,7 +53,7 @@ setuptools.setup(
     ],
     python_requires=">=3.9",
     install_requires=[
-        "accelerate>=0.18.0",
+        "accelerate>=0.21.0",
         "evaluate",
         "datasets>=2.0.0",
         "evaluate>=0.4.0",
@@ -62,10 +62,9 @@ setuptools.setup(
         "omegaconf>=2.2",
        "peft>=0.2.0",
         "pybind11>=2.6.2",
-        "pycountry",
         "pytablewriter",
         "rouge-score>=0.0.4",
-        "sacrebleu==1.5.0",
+        "sacrebleu>=1.5.0",
         "scikit-learn>=0.24.1",
         "sqlitedict",
         "torch>=1.8",
...
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If the tasks folder has changed, get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or, if the API has changed (the ENV variable API is set),
        # run the given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    else:
        # If neither is true, just run arc_easy.
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task = task_class()
        task.build_all_requests(rank=1, limit=limit, world_size=1)
        assert task.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

FILE_PATH = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [word for line in f.readlines() for word in line.strip().split(" ")]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            yaml_paths = [str(p) for p in Path(x).parent.glob("*.yaml")]
            _output |= {load_yaml_config(p)["task"] for p in yaml_paths}
    return list(_output)
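A hedged usage sketch of `parser` (the file paths are invented, and in a real run they must exist on disk, since `load_yaml_config` opens them): a changed YAML maps directly to its task name, while a changed `.py` file pulls in the task of every sibling YAML in the same folder.

```
changed = [
    "lm_eval/tasks/wsc273/default.yaml",              # hypothetical paths
    "lm_eval/tasks/wikitext/preprocess_wikitext.py",
]
print(parser(changed))
# e.g. ['wsc273', 'wikitext'] -- order unspecified, since parser de-duplicates via a set
```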
 from __future__ import annotations

 import pytest
+from pathlib import Path

 import numpy as np

 from lm_eval.models.huggingface import HFLM
 from lm_eval.api.instance import Instance
 import lm_eval.tasks as tasks
+import sys
+import torch


 class Test_HFLM:
+    torch.use_deterministic_algorithms(True)
+    version_minor = sys.version_info.minor
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
@@ -90,8 +94,15 @@ class Test_HFLM:
     def test_logliklihood(self) -> None:
         res = self.LM.loglikelihood(self.MULTIPLE_CH)
         _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
-        # change atol in case of consistent failure
-        assert np.allclose(_res, _RES, atol=1e-4)
+        # log samples to CI
+        dir_path = Path("test_logs")
+        dir_path.mkdir(parents=True, exist_ok=True)
+        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
+        file_path = file_path.resolve()
+        with open(file_path, "w") as f:
+            f.write("\n".join(str(x) for x in _res))
+        assert np.allclose(_res, _RES, atol=1e-2)
         # check indices for Multiple Choice
         argmax_RES, argmax_res = np.argmax(
             np.array(_RES).reshape(-1, 4), axis=1
...
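On the loosened tolerance: `np.allclose(a, b, atol=1e-2)` passes when `|a - b| <= atol + rtol * |b|`, with `rtol` defaulting to `1e-5`, so per-item log-likelihoods may now drift by roughly 0.01 before the test fails. A one-line check of that reading (values invented):

```
import numpy as np

# 0.008 <= 1e-2 + 1e-5 * 34.420, so this passes under the new tolerance
assert np.allclose([-34.412], [-34.420], atol=1e-2)
```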
 from itertools import islice

 import pytest
-from typing import List
+from .utils import new_tasks

 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask


-# Using fixtures to get the task class and limit
-@pytest.fixture()
-def task_class() -> ConfigurableTask:
-    task_name = ["arc_easy"]
-    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
-    return x[0]
+# Default Task
+TASKS = ["arc_easy"]
+
+
+def task_class():
+    global TASKS
+    # CI: new_tasks checks if any modifications have been made
+    task_classes = new_tasks()
+    # Check if task_classes is empty
+    if task_classes:
+        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
+    else:
+        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]


 @pytest.fixture()
@@ -18,109 +26,96 @@ def limit() -> int:

 # Tests
-def test_download(task_class: ConfigurableTask):
-    task_class().download()
-    assert task_class().dataset is not None
-
-
-def test_has_training_docs(task_class: ConfigurableTask):
-    assert task_class().has_training_docs() in [True, False]
-
-
-def test_check_training_docs(task_class: ConfigurableTask):
-    task = task_class()
-    if task.has_training_docs():
-        assert task._config["training_split"] is not None
-
-
-def test_has_validation_docs(task_class):
-    assert task_class().has_validation_docs() in [True, False]
-
-
-def test_check_validation_docs(task_class):
-    task = task_class()
-    if task.has_validation_docs():
-        assert task._config["validation_split"] is not None
-
-
-def test_has_test_docs(task_class):
-    assert task_class().has_test_docs() in [True, False]
-
-
-def test_check_test_docs(task_class):
-    task = task_class()
-    if task.has_test_docs():
-        assert task._config["test_split"] is not None
-
-
-def test_should_decontaminate(task_class):
-    task = task_class()
-    assert task.should_decontaminate() in [True, False]
-    if task.should_decontaminate():
-        assert task._config["doc_to_decontamination_query"] is not None
-
-
-def test_doc_to_text(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array = [task.doc_to_text(doc) for doc in arr]
-    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-    assert all(
-        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
-    )
-
-
-def test_create_choices(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    if "multiple_choice" in task._config.output_type:
-        _array = [task.doc_to_choice(doc) for doc in arr]
-        # assert all(len(x) == 4 for x in _array)
-        assert all(isinstance(x, list) for x in _array)
-        assert all(isinstance(x[0], str) for x in _array)
-
-
-def test_doc_to_target(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array_target = [task.doc_to_target(doc) for doc in arr]
-    if task._config.output_type == "multiple_choice":
-        assert all(isinstance(label, int) for label in _array_target)
-    # _array_text = [task.doc_to_text(doc) for doc in arr]
-    # Not working
-    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
-
-
-def test_build_all_requests(task_class, limit):
-    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
-    assert task_class.instances is not None
-
-
-# ToDO: Add proper testing
-def test_construct_requests(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    # assert all(isinstance(doc, list) for doc in requests)
-    assert len(requests) == limit if limit else True
+@pytest.mark.parametrize("task_class", task_class())
+class TestNewTasks:
+    def test_download(self, task_class: ConfigurableTask):
+        task_class.download()
+        assert task_class.dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class.has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        if task_class.has_training_docs():
+            assert task_class._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class.has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        if task_class.has_validation_docs():
+            assert task_class._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class.has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class
+        assert task.should_decontaminate() in [True, False]
+        if task.should_decontaminate():
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(
+            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+            for x in _array
+        )
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        if "multiple_choice" in task._config.output_type:
+            _array = [task.doc_to_choice(doc) for doc in arr]
+            # assert all(len(x) == 4 for x in _array)
+            assert all(isinstance(x, list) for x in _array)
+            assert all(isinstance(x[0], str) for x in _array)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array_target = [task.doc_to_target(doc) for doc in arr]
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
+        # _array_text = [task.doc_to_text(doc) for doc in arr]
+        # Not working
+        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
+
+    def test_build_all_requests(self, task_class, limit):
+        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task_class.instances is not None
+
+    # ToDO: Add proper testing
+    def test_construct_requests(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # assert all(isinstance(doc, list) for doc in requests)
+        assert len(requests) == limit if limit else True

 # def test_create_choices(task_class):
...
+import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
-import sys
+from typing import Union
+import os

+# {{{CI}}}
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
     with open(file_path, "r") as f:
         content = f.read()
         words_list = [x for x in content.split()]
-        sys.stdout.write(f"list of files: {words_list}")
     return words_list
@@ -30,3 +30,18 @@ def parser(full_path: List[str]) -> List[str]:
             path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
             _output |= {load_yaml_config(x)["task"] for x in path}
     return list(_output)
+
+
+def new_tasks() -> Union[list[str], None]:
+    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
+        return parser(load_changed_files(FILENAME))
+    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
+        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+    # if both not true just do arc_easy
+    else:
+        return