Unverified Commit 2387f39d authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into flan-benchmark

parents 7601d828 784fe037
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Romanian phrase: {{translation["ro"]}}

  English phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-ro-en
output_type: greedy_until
training_split: train
validation_split: validation
fewshot_split: validation
test_split: test
metric_list:
- metric: bleu
- metric: ter
- metric: chrf
generation_kwargs:
  until:
    - "\n"
  do_sample: false
  temperature: 0.0
repeats: 1
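As a quick illustration of what this config produces: the harness renders `doc_to_text` and `doc_to_target` as Jinja2 templates against each dataset row. A minimal sketch, with a made-up translation pair (real rows come from the wmt16 `ro-en` split):

```
from jinja2 import Template

doc = {"translation": {"ro": "Bună dimineața!", "en": "Good morning!"}}

prompt = Template('Romanian phrase: {{translation["ro"]}}\nEnglish phrase:')
target = Template(' {{translation["en"]}}')

print(prompt.render(**doc))  # Romanian phrase: Bună dimineața!
                             # English phrase:
print(target.render(**doc))  #  Good morning!   (note the leading space)
```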
@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
     string = string.replace(" 's", "'s")

     return string
+
+
+def process_results(doc, results):
+    (loglikelihood,) = results
+    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
+    _words = len(re.split(r"\s+", doc["page"]))
+    _bytes = len(doc["page"].encode("utf-8"))
+    return {
+        "word_perplexity": (loglikelihood, _words),
+        "byte_perplexity": (loglikelihood, _bytes),
+        "bits_per_byte": (loglikelihood, _bytes),
+    }
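These tuples only become scores at aggregation time: the harness sums log-likelihoods and unit counts across all documents before exponentiating. A hedged sketch of that aggregation, assuming the standard weighted-perplexity definitions (the harness's own metric functions may differ in detail):

```
import math

def weighted_perplexity(items):
    # items: list of (loglikelihood, num_units) pairs, units = words or bytes
    total_ll = sum(ll for ll, _ in items)
    total_units = sum(n for _, n in items)
    return math.exp(-total_ll / total_units)

def bits_per_byte(items):
    # same pairs, but normalized per byte and converted to base 2
    total_ll = sum(ll for ll, _ in items)
    total_bytes = sum(n for _, n in items)
    return -total_ll / (total_bytes * math.log(2))
```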
@@ -7,6 +7,7 @@ validation_split: validation
 test_split: test
 doc_to_text: ""
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
+process_results: !function preprocess_wikitext.process_results
 should_decontaminate: true
 doc_to_decontamination_query: "{{page}}"
 metric_list:
...
# WSC273
### Paper
Title: `The Winograd Schema Challenge`
Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in "A Simple Method for Commonsense Reasoning" (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
### Citation
```
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
```
### Groups and Tasks
#### Groups
* Not part of any group yet.
#### Tasks
* `wsc273`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: wsc273
dataset_path: winograd_wsc
dataset_name: wsc273
output_type: multiple_choice
test_split: test
doc_to_text: label
process_docs: !function utils.process_doc
doc_to_target: "{% set index = pronoun_loc + pronoun | length %}{{text[index:]}}"
doc_to_choice: "{% set template = text[:pronoun_loc] %}{{[template+options[0], template+options[1]]}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
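A sketch of what the two templates above compute, on a toy document (not from the dataset): both choices share the continuation after the pronoun, so the model is scored on the likelihood of the same suffix under each substituted prefix — the `partial evaluation` the README describes.

```
doc = {
    "text": "The trophy doesn't fit in the suitcase because it is too big.",
    "pronoun": "it",
    "pronoun_loc": 47,
    "options": ["the trophy", "the suitcase"],
}

prefix = doc["text"][: doc["pronoun_loc"]]                        # text up to the pronoun
suffix = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]):]   # shared target: " is too big."
choices = [prefix + opt for opt in doc["options"]]

print(choices[0] + suffix)  # ...because the trophy is too big.
print(choices[1] + suffix)  # ...because the suitcase is too big.
```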
upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]


def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)


def __normalize_option(doc, option):
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"
    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())
    return option
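And a quick sketch of `__normalize_option` on an invented document: a possessive pronoun triggers the `'s` suffix, while proper-noun options keep their capitalization, since only the determiners and pronouns in `upper_pronouns` get lowercased.

```
# Invented example; real docs come from the winograd_wsc `wsc273` split.
doc = {
    "text": "Jane gave Joan candy because her hunger was obvious.",
    "pronoun": "her",
    "pronoun_loc": 29,
    "options": ["Jane", "Joan"],
}
print(__normalize_option(doc, "Jane"))  # -> "Jane's"
print(__normalize_option(doc, "Joan"))  # -> "Joan's"
```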
@@ -10,7 +10,7 @@ import collections
 import importlib.util
 import fnmatch

-from typing import List, Literal, Union
+from typing import Iterator, List, Literal, Union

 import gc
 import torch
@@ -65,7 +65,7 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n=0, fn=None):
+def chunks(iter, n: int = 0, fn=None):
     arr = []
     for i, x in enumerate(iter):
         arr.append(x)
@@ -87,11 +87,11 @@ def group(arr, fn):
 class MultiChoice:
-    def __init__(self, choices):
+    def __init__(self, choices) -> None:
         self.choices = choices

     # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
+    def __contains__(self, values) -> bool:
         for value in values.split(","):
             if len(fnmatch.filter(self.choices, value)) == 0:
                 eval_logger.info(f"Available tasks to choose:")
@@ -100,7 +100,7 @@ class MultiChoice:
                 raise ValueError("'{}' is not in task list".format(value))
         return True

-    def __iter__(self):
+    def __iter__(self) -> Iterator:
         for choice in self.choices:
             yield choice
@@ -108,7 +108,6 @@ class MultiChoice:
 # Returns a list containing all values of the source_list that
 # match at least one of the patterns
 def pattern_match(patterns, source_list):
     if type(patterns) == str:
         patterns = [patterns]
@@ -177,7 +176,7 @@ def make_disjoint_window(pair):
 class Reorderer:
-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         self.size = len(arr)
         arr = list(enumerate(arr))
         arr = group(arr, lambda x: fn(x[1]))
@@ -212,7 +211,7 @@ class Grouper:
     objects in `arr` satisfying `key == fn(ob)`.
     """

-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         # self.orig_arr = arr
         self.size = len(arr)
         arr = list(enumerate(arr))
@@ -263,7 +262,7 @@ class Grouper:
         return res


-def make_table(result_dict, column="results"):
+def make_table(result_dict, column: str = "results"):
     """Generate table of results."""
     from pytablewriter import MarkdownTableWriter, LatexTableWriter
@@ -393,7 +392,6 @@ def get_git_commit_hash():
 def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
@@ -451,7 +449,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
     return yaml_config


-def regex_replace(string, pattern, repl, count=0):
+def regex_replace(string, pattern, repl, count: int = 0):
     """Implements the `re.sub` function as a custom Jinja filter."""
     return re.sub(pattern, repl, string, count=count)
@@ -525,7 +523,7 @@ def pad_and_concat(
     return torch.cat(tensors, dim=0)


-def clear_torch_cache():
+def clear_torch_cache() -> None:
     gc.collect()
     torch.cuda.empty_cache()
@@ -550,7 +548,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         tokenizer: transformers.PreTrainedTokenizer,
         initial_decoder_input_length: int,
         batch_size: int,
-    ):
+    ) -> None:
         self.initial_decoder_input_length = initial_decoder_input_length
         self.done_tracker = [False] * batch_size
         self.sequence = sequence
...
@@ -9,24 +9,26 @@ from pathlib import Path

 from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
 from lm_eval.benchmarks import include_benchmarks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"


-def parse_args():
-    parser = argparse.ArgumentParser()
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--tasks",
+        default=None,
+        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
+    )
     parser.add_argument(
         "--model_args",
         default="",
         help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
     )
-    parser.add_argument(
-        "--tasks", default=None  # , choices=utils.MultiChoice(sorted(ALL_TASKS))
-    )
     parser.add_argument(
         "--num_fewshot",
         type=int,
@@ -99,7 +101,7 @@ def parse_args():
     return parser.parse_args()


-def main():
+def main() -> None:
     args = parse_args()

     if args.limit:
@@ -126,10 +128,21 @@ def main():
         else:
             tasks_list = args.tasks.split(",")
             task_names = utils.pattern_match(tasks_list, ALL_TASKS)
+            task_missing = []
             for task in [task for task in tasks_list if task not in task_names]:
                 if os.path.isfile(task):
                     config = utils.load_yaml_config(task)
                     task_names.append(config)
+                else:
+                    task_missing.append(task)
+
+            if task_missing != []:
+                missing = ", ".join(task_missing)
+                eval_logger.error(
+                    f"Tasks were not found: {missing}\n"
+                    f"{SPACING}Try `lm-eval -h` for list of available tasks",
+                )
+                raise ValueError(f"Tasks {missing} were not found.")

     if args.output_path:
         path = Path(args.output_path)
...
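For readers unfamiliar with the `utils.pattern_match` call above: it expands shell-style wildcards in `--tasks` against the task registry via `fnmatch`. A self-contained sketch of that behavior (task names here are illustrative; the real implementation lives in `lm_eval/utils.py`):

```
import fnmatch

ALL_TASKS = ["wmt16-ro-en", "wmt16-de-en", "wsc273", "wikitext"]

def pattern_match(patterns, source_list):
    if isinstance(patterns, str):
        patterns = [patterns]
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return sorted(task_names)

print(pattern_match(["wmt16*"], ALL_TASKS))  # ['wmt16-de-en', 'wmt16-ro-en']
```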
[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True
# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True
[mypy-lm_eval.api.*]
ignore_errors = True
[mypy-lm_eval.prompts.*]
ignore_errors = True
[mypy-lm_eval.models.*]
ignore_errors = True
[mypy-scripts.*]
ignore_errors = True
[mypy-main]
ignore_errors = True
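A small illustration of what `check_untyped_defs = True` changes: by default mypy skips the bodies of unannotated functions, but with this flag it still checks them, which is what lets the per-package `ignore_errors` overrides above be peeled away gradually. A hypothetical module, not from the repo:

```
# hypothetical example
def greet(name):           # unannotated signature: body normally unchecked
    count: int = len(name)
    return count + "!"     # flagged only when check_untyped_defs = True:
                           # unsupported operand types for + ("int" and "str")
```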
@@ -15,7 +15,7 @@ extras_require = {
     ],
     "testing": ["pytest", "pytest-cov", "pytest-xdist"],
     "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
+    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
     "promptsource": [
         "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
     ],
@@ -53,7 +53,7 @@ setuptools.setup(
     ],
     python_requires=">=3.9",
     install_requires=[
-        "accelerate>=0.18.0",
+        "accelerate>=0.21.0",
         "evaluate",
         "datasets>=2.0.0",
         "evaluate>=0.4.0",
@@ -62,10 +62,9 @@ setuptools.setup(
         "omegaconf>=2.2",
        "peft>=0.2.0",
         "pybind11>=2.6.2",
-        "pycountry",
         "pytablewriter",
         "rouge-score>=0.0.4",
-        "sacrebleu==1.5.0",
+        "sacrebleu>=1.5.0",
         "scikit-learn>=0.24.1",
         "sqlitedict",
         "torch>=1.8",
...
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If the tasks folder has changed, get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or, if the API has changed (the ENV variable API is set),
        # run the given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    else:
        # If neither is true, just run arc_easy.
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task = task_class()
        task.build_all_requests(rank=1, limit=limit, world_size=1)
        assert task.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

FILE_PATH = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [word for line in f.readlines() for word in line.strip().split(" ")]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            yaml_paths = [str(p) for p in Path(x).parent.glob("*.yaml")]
            _output |= {load_yaml_config(p)["task"] for p in yaml_paths}
    return list(_output)
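A hedged usage sketch of `parser` (the file paths are invented, and in a real run they must exist on disk, since `load_yaml_config` opens them): a changed YAML maps directly to its task name, while a changed `.py` file pulls in the task of every sibling YAML in the same folder.

```
changed = [
    "lm_eval/tasks/wsc273/default.yaml",              # hypothetical paths
    "lm_eval/tasks/wikitext/preprocess_wikitext.py",
]
print(parser(changed))
# e.g. ['wsc273', 'wikitext'] -- order unspecified, since parser de-duplicates via a set
```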
 from __future__ import annotations

 import pytest
+from pathlib import Path

 import numpy as np

 from lm_eval.models.huggingface import HFLM
 from lm_eval.api.instance import Instance
 import lm_eval.tasks as tasks
+import sys
+import torch


 class Test_HFLM:
+    torch.use_deterministic_algorithms(True)
+    version_minor = sys.version_info.minor
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
@@ -90,8 +94,15 @@ class Test_HFLM:
     def test_logliklihood(self) -> None:
         res = self.LM.loglikelihood(self.MULTIPLE_CH)
         _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
-        # change atol in case of consistent failure
-        assert np.allclose(_res, _RES, atol=1e-4)
+        # log samples to CI
+        dir_path = Path("test_logs")
+        dir_path.mkdir(parents=True, exist_ok=True)
+        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
+        file_path = file_path.resolve()
+        with open(file_path, "w") as f:
+            f.write("\n".join(str(x) for x in _res))
+        assert np.allclose(_res, _RES, atol=1e-2)
         # check indices for Multiple Choice
         argmax_RES, argmax_res = np.argmax(
             np.array(_RES).reshape(-1, 4), axis=1
...
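On the loosened tolerance: `np.allclose(a, b, atol=1e-2)` passes when `|a - b| <= atol + rtol * |b|`, with `rtol` defaulting to `1e-5`, so per-item log-likelihoods may now drift by roughly 0.01 before the test fails. A one-line check of that reading (values invented):

```
import numpy as np

# 0.008 <= 1e-2 + 1e-5 * 34.420, so this passes under the new tolerance
assert np.allclose([-34.412], [-34.420], atol=1e-2)
```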
 from itertools import islice

 import pytest
-from typing import List
+from .utils import new_tasks

 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask


-# Using fixtures to get the task class and limit
-@pytest.fixture()
-def task_class() -> ConfigurableTask:
-    task_name = ["arc_easy"]
-    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
-    return x[0]
+# Default Task
+TASKS = ["arc_easy"]
+
+
+def task_class():
+    global TASKS
+    # CI: new_tasks checks if any modifications have been made
+    task_classes = new_tasks()
+    # Check if task_classes is empty
+    if task_classes:
+        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
+    else:
+        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]


 @pytest.fixture()
@@ -18,109 +26,96 @@ def limit() -> int:

 # Tests
-def test_download(task_class: ConfigurableTask):
-    task_class().download()
-    assert task_class().dataset is not None
-
-
-def test_has_training_docs(task_class: ConfigurableTask):
-    assert task_class().has_training_docs() in [True, False]
-
-
-def test_check_training_docs(task_class: ConfigurableTask):
-    task = task_class()
-    if task.has_training_docs():
-        assert task._config["training_split"] is not None
-
-
-def test_has_validation_docs(task_class):
-    assert task_class().has_validation_docs() in [True, False]
-
-
-def test_check_validation_docs(task_class):
-    task = task_class()
-    if task.has_validation_docs():
-        assert task._config["validation_split"] is not None
-
-
-def test_has_test_docs(task_class):
-    assert task_class().has_test_docs() in [True, False]
-
-
-def test_check_test_docs(task_class):
-    task = task_class()
-    if task.has_test_docs():
-        assert task._config["test_split"] is not None
-
-
-def test_should_decontaminate(task_class):
-    task = task_class()
-    assert task.should_decontaminate() in [True, False]
-    if task.should_decontaminate():
-        assert task._config["doc_to_decontamination_query"] is not None
-
-
-def test_doc_to_text(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array = [task.doc_to_text(doc) for doc in arr]
-    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-    assert all(
-        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
-    )
-
-
-def test_create_choices(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    if "multiple_choice" in task._config.output_type:
-        _array = [task.doc_to_choice(doc) for doc in arr]
-        # assert all(len(x) == 4 for x in _array)
-        assert all(isinstance(x, list) for x in _array)
-        assert all(isinstance(x[0], str) for x in _array)
-
-
-def test_doc_to_target(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array_target = [task.doc_to_target(doc) for doc in arr]
-    if task._config.output_type == "multiple_choice":
-        assert all(isinstance(label, int) for label in _array_target)
-    # _array_text = [task.doc_to_text(doc) for doc in arr]
-    # Not working
-    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
-
-
-def test_build_all_requests(task_class, limit):
-    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
-    assert task_class.instances is not None
-
-
-# ToDO: Add proper testing
-def test_construct_requests(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    # assert all(isinstance(doc, list) for doc in requests)
-    assert len(requests) == limit if limit else True
+@pytest.mark.parametrize("task_class", task_class())
+class TestNewTasks:
+    def test_download(self, task_class: ConfigurableTask):
+        task_class.download()
+        assert task_class.dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class.has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        if task_class.has_training_docs():
+            assert task_class._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class.has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        if task_class.has_validation_docs():
+            assert task_class._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class.has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class
+        assert task.should_decontaminate() in [True, False]
+        if task.should_decontaminate():
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(
+            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+            for x in _array
+        )
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        if "multiple_choice" in task._config.output_type:
+            _array = [task.doc_to_choice(doc) for doc in arr]
+            # assert all(len(x) == 4 for x in _array)
+            assert all(isinstance(x, list) for x in _array)
+            assert all(isinstance(x[0], str) for x in _array)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array_target = [task.doc_to_target(doc) for doc in arr]
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
+        # _array_text = [task.doc_to_text(doc) for doc in arr]
+        # Not working
+        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
+
+    def test_build_all_requests(self, task_class, limit):
+        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task_class.instances is not None
+
+    # ToDO: Add proper testing
+    def test_construct_requests(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # assert all(isinstance(doc, list) for doc in requests)
+        assert len(requests) == limit if limit else True

 # def test_create_choices(task_class):
...
+import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
-import sys
+from typing import Union
+import os

+# {{{CI}}}
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
     with open(file_path, "r") as f:
         content = f.read()
         words_list = [x for x in content.split()]
-        sys.stdout.write(f"list of files: {words_list}")
     return words_list
@@ -30,3 +30,18 @@ def parser(full_path: List[str]) -> List[str]:
             path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
             _output |= {load_yaml_config(x)["task"] for x in path}
     return list(_output)
+
+
+def new_tasks() -> Union[list[str], None]:
+    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
+        return parser(load_changed_files(FILENAME))
+    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
+        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+    # if both not true just do arc_easy
+    else:
+        return