gaoqiong / lm-evaluation-harness · Commits

Commit 3263c572
Authored Sep 18, 2023 by lintangsutawika
Parents: a27e8ed1, 33d52483

    Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into squadv2
Showing 14 changed files with 348 additions and 374 deletions.
Files changed:

    lm_eval/tasks/wsc273/default.yaml    +15   -0
    lm_eval/tasks/wsc273/utils.py        +36   -0
    lm_eval/utils.py                     +16   -20
    main.py                              +23   -9
    mypy.ini                             +29   -0
    pyproject.toml                       +84   -0
    scripts/write_out.py                 +3    -1
    setup.py                             +2    -75
    tests/extra/test_new_tasks.py        +0    -129
    tests/extra/test_utils.py            +0    -23
    tests/models/test_huggingface.py     +14   -3
    tests/test_evaluator.py              +2    -1
    tests/test_tasks.py                  +105  -110
    tests/utils.py                       +19   -3
lm_eval/tasks/wsc273/default.yaml  (new file)

task: wsc273
dataset_path: winograd_wsc
dataset_name: wsc273
output_type: multiple_choice
test_split: test
doc_to_text: label
process_docs: !function utils.process_doc
doc_to_target: "{% set index = pronoun_loc + pronoun | length %}{{text[index:]}}"
doc_to_choice: "{% set template = text[:pronoun_loc] %}{{[template+options[0], template+options[1]]}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
lm_eval/tasks/wsc273/utils.py  (new file)

upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]


def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)


def __normalize_option(doc, option):
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"

    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())
    return option
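Taken together, the new task works like this: process_docs normalizes each document, doc_to_choice keeps the text up to the pronoun and appends each candidate option, and doc_to_target keeps the text after the pronoun as the shared continuation to be scored. A minimal plain-Python sketch of how the two Jinja templates resolve, using a made-up document laid out like a winograd_wsc record (not taken from the dataset):

    # Sketch only; the document below is hypothetical, for illustration.
    text = "The trophy doesn't fit into the suitcase because it is too small."
    doc = {
        "text": text,
        "pronoun": "it",
        "pronoun_loc": text.index(" it ") + 1,  # start index of the pronoun
        "options": ["the trophy", "the suitcase"],
    }

    # doc_to_choice: keep everything before the pronoun, append each option.
    template = doc["text"][: doc["pronoun_loc"]]
    choices = [template + doc["options"][0], template + doc["options"][1]]

    # doc_to_target: keep everything after the pronoun; this is the shared
    # continuation the model scores given each choice as context.
    index = doc["pronoun_loc"] + len(doc["pronoun"])
    target = doc["text"][index:]

    print(choices[1] + target)  # "... because the suitcase is too small."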
lm_eval/utils.py

@@ -10,13 +10,12 @@ import collections
 import importlib.util
 import fnmatch
-from typing import List, Literal, Union
+from typing import Iterator, List, Literal, Union
 import gc

 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
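The rewritten simple_parse_args_string drops the OmegaConf dependency and parses the comma-separated key=value string by hand; one apparent side effect is that values now stay plain strings, where OmegaConf would infer primitive types. A standalone sketch of the same logic, assuming every item really is a single key=value pair:

    def parse_args_string(args_string: str) -> dict:
        # Mirrors the new simple_parse_args_string: comma-separated key=value
        # pairs, empty items skipped, no OmegaConf involved.
        args_string = args_string.strip()
        if not args_string:
            return {}
        arg_list = [arg for arg in args_string.split(",") if arg]
        return {k: v for k, v in [arg.split("=") for arg in arg_list]}

    print(parse_args_string("pretrained=EleutherAI/pythia-160m,dtype=float32"))
    # -> {'pretrained': 'EleutherAI/pythia-160m', 'dtype': 'float32'}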
@@ -65,7 +64,7 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n=0, fn=None):
+def chunks(iter, n: int = 0, fn=None):
     arr = []
     for i, x in enumerate(iter):
         arr.append(x)

@@ -87,11 +86,11 @@ def group(arr, fn):
 class MultiChoice:
-    def __init__(self, choices):
+    def __init__(self, choices) -> None:
         self.choices = choices

     # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
+    def __contains__(self, values) -> bool:
         for value in values.split(","):
             if len(fnmatch.filter(self.choices, value)) == 0:
                 eval_logger.info(f"Available tasks to choose:")

@@ -100,7 +99,7 @@ class MultiChoice:
                 raise ValueError("'{}' is not in task list".format(value))
         return True

-    def __iter__(self):
+    def __iter__(self) -> Iterator:
         for choice in self.choices:
             yield choice

@@ -108,7 +107,6 @@ class MultiChoice:
 # Returns a list containing all values of the source_list that
 # match at least one of the patterns
 def pattern_match(patterns, source_list):
     if type(patterns) == str:
         patterns = [patterns]

@@ -177,7 +175,7 @@ def make_disjoint_window(pair):
 class Reorderer:
-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         self.size = len(arr)
         arr = list(enumerate(arr))
         arr = group(arr, lambda x: fn(x[1]))

@@ -212,7 +210,7 @@ class Grouper:
     objects in `arr` satisfying `key == fn(ob)`.
     """

-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         # self.orig_arr = arr
         self.size = len(arr)
         arr = list(enumerate(arr))

@@ -263,14 +261,14 @@ class Grouper:
         return res


-def make_table(result_dict, column="results"):
+def make_table(result_dict, column: str = "results"):
     """Generate table of results."""
     from pytablewriter import MarkdownTableWriter, LatexTableWriter

     if column == "results":
-        column_name = "Task"
-    elif column == "aggregate":
-        column_name = "Benchmark"
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"

     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()

@@ -393,7 +391,6 @@ def get_git_commit_hash():
 def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)

@@ -428,7 +425,6 @@ def load_yaml_config(yaml_path):
     include_path.reverse()
     final_yaml_config = {}
     for path in include_path:
         # Assumes that path is a full path.
         # If not found, assume the included yaml
         # is in the same dir as the original yaml

@@ -447,7 +443,7 @@ def load_yaml_config(yaml_path):
     return yaml_config


-def regex_replace(string, pattern, repl, count=0):
+def regex_replace(string, pattern, repl, count: int = 0):
     """Implements the `re.sub` function as a custom Jinja filter."""
     return re.sub(pattern, repl, string, count=count)

@@ -521,7 +517,7 @@ def pad_and_concat(
     return torch.cat(tensors, dim=0)


-def clear_torch_cache():
+def clear_torch_cache() -> None:
     gc.collect()
     torch.cuda.empty_cache()

@@ -546,7 +542,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         tokenizer: transformers.PreTrainedTokenizer,
         initial_decoder_input_length: int,
         batch_size: int,
-    ):
+    ) -> None:
         self.initial_decoder_input_length = initial_decoder_input_length
         self.done_tracker = [False] * batch_size
         self.sequence = sequence
main.py

@@ -9,23 +9,26 @@ from pathlib import Path
 from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
-from lm_eval.benchmarks import include_benchmarks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"


-def parse_args():
-    parser = argparse.ArgumentParser()
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--tasks",
+        default=None,
+        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
+    )
     parser.add_argument(
         "--model_args",
         default="",
         help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
     )
-    parser.add_argument(
-        "--tasks", default=None
-        # , choices=utils.MultiChoice(sorted(ALL_TASKS))
-    )
     parser.add_argument(
         "--num_fewshot",
         type=int,

@@ -98,7 +101,7 @@ def parse_args():
     return parser.parse_args()


-def main():
+def main() -> None:
     args = parse_args()

     if args.limit:

@@ -125,10 +128,21 @@ def main():
     else:
         tasks_list = args.tasks.split(",")
         task_names = utils.pattern_match(tasks_list, ALL_TASKS)
+        task_missing = []
         for task in [task for task in tasks_list if task not in task_names]:
             if os.path.isfile(task):
                 config = utils.load_yaml_config(task)
                 task_names.append(config)
+            else:
+                task_missing.append(task)
+
+        if task_missing != []:
+            missing = ", ".join(task_missing)
+            eval_logger.error(
+                f"Tasks were not found: {missing}\n"
+                f"{SPACING}Try `lm-eval -h` for list of available tasks",
+            )
+            raise ValueError(f"Tasks {missing} were not found.")

     if args.output_path:
         path = Path(args.output_path)

@@ -195,8 +209,8 @@ def main():
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(evaluator.make_table(results))
-       if "aggregate" in results:
-           print(evaluator.make_table(results, "aggregate"))
+       if "groups" in results:
+           print(evaluator.make_table(results, "groups"))


 if __name__ == "__main__":
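With the new task_missing bookkeeping, a task name that neither matches the registry nor points at a YAML file on disk now fails loudly instead of being silently dropped. A condensed, standalone sketch of that flow; the registry contents are illustrative, and pattern_match is re-implemented inline to mirror the fnmatch-based helper from lm_eval/utils.py:

    import fnmatch
    import os

    ALL_TASKS = ["arc_challenge", "arc_easy", "hellaswag", "wsc273"]  # illustrative registry


    def pattern_match(patterns, source_list):
        # Keep every registry entry matching at least one (possibly wildcarded) pattern.
        matched = set()
        for pattern in patterns:
            matched.update(fnmatch.filter(source_list, pattern))
        return sorted(matched)


    tasks_list = ["arc_easy", "wsc273", "squadv3"]      # "squadv3" is a typo
    task_names = pattern_match(tasks_list, ALL_TASKS)   # ['arc_easy', 'wsc273']
    task_missing = [
        t for t in tasks_list
        if t not in task_names and not os.path.isfile(t)  # a real path would be loaded as a YAML config instead
    ]

    if task_missing:
        missing = ", ".join(task_missing)
        print(f"Tasks were not found: {missing}")
        # main() logs this via eval_logger.error and then raises
        # ValueError(f"Tasks {missing} were not found.")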
mypy.ini  (new file)

[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True

# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True

[mypy-lm_eval.api.*]
ignore_errors = True

[mypy-lm_eval.prompts.*]
ignore_errors = True

[mypy-lm_eval.models.*]
ignore_errors = True

[mypy-scripts.*]
ignore_errors = True

[mypy-main]
ignore_errors = True
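The strict flags under [mypy] apply project-wide, while the per-package ignore_errors sections temporarily silence each package; that is what lets annotations be added gradually, as the comment says. Running mypy from the repository root should pick this file up automatically, since mypy reads a mypy.ini in the current directory by default.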
pyproject.toml  (new file)

[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "lm_eval"
version = "1.0.0"
authors = [
    {name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
dependencies = [
    "accelerate>=0.21.0",
    "evaluate",
    "datasets>=2.0.0",
    "evaluate>=0.4.0",
    "jsonlines",
    "numexpr",
    "peft>=0.2.0",
    "pybind11>=2.6.2",
    "pytablewriter",
    "rouge-score>=0.0.4",
    "sacrebleu>=1.5.0",
    "scikit-learn>=0.24.1",
    "sqlitedict",
    "torch>=1.8",
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
]

[tool.setuptools]
packages = ["lm_eval"]

# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]

[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"

[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
    "flake8",
    "pylint",
    "mypy",
    "pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
    "lm_eval[dev]",
    "lm_eval[testing]",
    "lm_eval[linting]",
    "lm_eval[multilingual]",
    "lm_eval[sentencepiece]",
    "lm_eval[promptsource]",
    "lm_eval[gptq]",
    "lm_eval[anthropic]",
    "lm_eval[openai]",
]
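With this metadata in place, a development setup would typically be an editable install with whichever extras are needed, e.g. pip install -e ".[dev]", and the [project.scripts] table exposes the same main() entry point under both the lm-eval and lm_eval console commands.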
scripts/write_out.py

@@ -38,13 +38,15 @@ def main():
     iters = []

     for set in args.sets.split(","):
+        docs = None
         if set == "train" and task.has_training_docs():
             docs = task.training_docs()
         if set == "val" and task.has_validation_docs():
             docs = task.validation_docs()
         if set == "test" and task.has_test_docs():
             docs = task.test_docs()
-        iters.append(docs)
+        if docs is not None:
+            iters.append(docs)

     docs = join_iters(iters)
setup.py

 import setuptools
-import itertools

-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
+# This is to make sure that the package supports editable installs
+setuptools.setup()

-extras_require = {
-    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-    "linting": [
-        "flake8",
-        "pylint",
-        "mypy",
-        "pre-commit",
-    ],
-    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
-    "promptsource": [
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-    ],
-    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-    "anthropic": ["anthropic"],
-    "openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-
-setuptools.setup(
-    name="lm_eval",
-    version="1.0.0",
-    author="EleutherAI",
-    author_email="contact@eleuther.ai",
-    description="A framework for evaluating language models",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/EleutherAI/lm-evaluation-harness",
-    packages=setuptools.find_packages(),
-    # required to include yaml files in pip installation
-    package_data={
-        "lm_eval": ["**/*.yaml", "tasks/**/*"],
-        "examples": ["**/*.yaml"],
-    },
-    entry_points={
-        "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
-    },
-    include_package_data=True,
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.9",
-    install_requires=[
-        "accelerate>=0.18.0",
-        "evaluate",
-        "datasets>=2.0.0",
-        "evaluate>=0.4.0",
-        "jsonlines",
-        "numexpr",
-        "omegaconf>=2.2",
-        "peft>=0.2.0",
-        "pybind11>=2.6.2",
-        "pycountry",
-        "pytablewriter",
-        "rouge-score>=0.0.4",
-        "sacrebleu==1.5.0",
-        "scikit-learn>=0.24.1",
-        "sqlitedict",
-        "torch>=1.8",
-        "tqdm-multiprocess",
-        "transformers>=4.1",
-        "zstandard",
-    ],
-    extras_require=extras_require,
-)
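Everything removed here reappears in the new pyproject.toml above, with a few pins updated (e.g. accelerate>=0.21.0, sacrebleu>=1.5.0) and omegaconf dropped from the core dependencies, matching its removal from lm_eval/utils.py. The bare setuptools.setup() call that remains is a shim so editable installs keep working, as the in-file comment notes.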
tests/extra/test_new_tasks.py  (deleted)

import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If tasks folder has changed then we get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or if API has changed then we set the ENV variable API to True
        # and run given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if both not true just do arc_easy
    else:
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
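This suite does not simply disappear: the reworked tests/test_tasks.py below carries an equivalent parametrized TestNewTasks class, driven by the new_tasks() helper that moves into the renamed tests/utils.py.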
tests/extra/test_utils.py  (deleted)

import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [l for line in f.readlines() for l in line.strip().split(" ")]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
tests/models/test_huggingface.py

 from __future__ import annotations

 import pytest
+from pathlib import Path

 import numpy as np

 from lm_eval.models.huggingface import HFLM
 from lm_eval.api.instance import Instance
 import lm_eval.tasks as tasks
+import sys
+import torch


 class Test_HFLM:
+    torch.use_deterministic_algorithms(True)
+    version_minor = sys.version_info.minor
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

@@ -90,8 +94,15 @@ class Test_HFLM:
     def test_logliklihood(self) -> None:
         res = self.LM.loglikelihood(self.MULTIPLE_CH)
         _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
-        # change atol in case of consistent failure
-        assert np.allclose(_res, _RES, atol=1e-4)
+        # log samples to CI
+        dir_path = Path("test_logs")
+        dir_path.mkdir(parents=True, exist_ok=True)
+
+        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
+        file_path = file_path.resolve()
+        with open(file_path, "w") as f:
+            f.write("\n".join(str(x) for x in _res))
+        assert np.allclose(_res, _RES, atol=1e-2)
         # check indices for Multiple Choice
         argmax_RES, argmax_res = np.argmax(
             np.array(_RES).reshape(-1, 4), axis=1
tests/test_evaluator.py

@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List

 import random
 import pytest

@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
tests/test_tasks.py

The file is reorganized from standalone, fixture-based test functions (the old task_class() fixture returned a single hard-coded arc_easy task class, which each test instantiated itself) into the parametrized TestNewTasks class that previously lived in tests/extra/test_new_tasks.py, now driven by new_tasks() from the renamed tests/utils.py. After the change (-110 +105) the file reads:

from itertools import islice

import pytest
from typing import List

from .utils import new_tasks

import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask


# Default Task
TASKS = ["arc_easy"]


def task_class():
    global TASKS
    # CI: new_tasks checks if any modifications have been made
    task_classes = new_tasks()
    # Check if task_classes is empty
    if task_classes:
        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
    else:
        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class.download()
        assert task_class.dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class.has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class.has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class.has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True

    # def test_create_choices(task_class):
tests/extra/utilities_testing.py → tests/utils.py  (renamed)

-import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
-import sys
+from typing import Union
+import os

+# {{{CI}}}
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

@@ -14,7 +16,6 @@ def load_changed_files(file_path: str) -> List[str]:
     with open(file_path, "r") as f:
         content = f.read()
         words_list = [x for x in content.split()]
-        sys.stdout.write(f"list of files: {words_list}")
         return words_list

@@ -30,3 +31,18 @@ def parser(full_path: List[str]) -> List[str]:
         path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
         _output |= {load_yaml_config(x)["task"] for x in path}
     return list(_output)
+
+
+def new_tasks() -> Union[List[str], None]:
+    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
+        return parser(load_changed_files(FILENAME))
+    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
+        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+    # if both not true just do arc_easy
+    else:
+        return
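new_tasks() is what lets CI size the test matrix to the actual changes: an edited task YAML contributes its own task name, an edited .py file contributes every task YAML sitting next to it, the API environment flag falls back to a fixed battery, and otherwise the default list is used. A small runnable sketch of the YAML/.py dispatch in parser(), with load_yaml_config replaced by a stub so it runs without the real task files on disk:

    from pathlib import Path
    from typing import List, Set


    def fake_load_yaml_config(path: str) -> dict:
        # Stub for lm_eval.utils.load_yaml_config: pretend each config's "task"
        # is named after the folder it lives in.
        return {"task": Path(path).parent.name}


    def resolve_tasks(changed_files: List[str]) -> List[str]:
        # Same dispatch as parser(): YAML files map to their task name directly,
        # Python files pull in every sibling *.yaml config.
        tasks: Set[str] = set()
        for f in changed_files:
            if f.endswith(".yaml"):
                tasks.add(fake_load_yaml_config(f)["task"])
            elif f.endswith(".py"):
                tasks |= {fake_load_yaml_config(str(y))["task"] for y in Path(f).parent.glob("*.yaml")}
        return sorted(tasks)


    print(resolve_tasks(["lm_eval/tasks/wsc273/default.yaml", "lm_eval/tasks/wsc273/utils.py"]))
    # -> ['wsc273'] in this toy setup (the .py branch only finds sibling YAMLs that exist on disk)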