gaoqiong / lm-evaluation-harness · Commits

Commit 3e1301bb
Authored Jun 04, 2024 by lintangsutawika

    resolved merge conflict from latest version

Parents: fd9cd80f, 070d31df

Showing 19 changed files with 276 additions and 225 deletions (+276 −225)
lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction    +18  −0
lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive   +18  −0
lm_eval/tasks/unitxt/unitxt_wrapper.py                        +46  −0
lm_eval/tasks/unitxt/xsum.yaml                                 +3  −0
lm_eval/tasks/unitxt/yahoo_answers_topics.yaml                 +3  −0
lm_eval/utils.py                                              +25  −6
pyproject.toml                                                 +5  −2
tests/models/test_gguf.py                                      +2  −2
tests/models/test_huggingface.py                              +10  −6
tests/models/test_neuralmagic.py                               +1  −1
tests/models/test_openvino.py                                  +4  −4
tests/models/test_vllm.py                                      +1  −1
tests/test_evaluator.py                                       +10  −7
tests/test_janitor.py                                         +53  −114
tests/test_requests_caching.py                                 +6  −7
tests/test_tasks.py                                            +2  −5
tests/test_utils.py                                           +32  −34
tests/testyamls/test-01.yaml                                  +32  −30
tests/utils.py                                                 +5  −6
lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction (new file, mode 100644)

group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_ner
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive (new file, mode 100644)

group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_rouge
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
lm_eval/tasks/unitxt/unitxt_wrapper.py (new file, mode 100644)

try:
    from unitxt import evaluate
except ImportError:
    raise ImportError(
        "Package 'unitxt' is not installed. To install it, use `pip install 'lm_eval[unitxt]'`"
    )

from lm_eval.api.registry import AGGREGATION_REGISTRY, METRIC_REGISTRY, register_metric


def unitxt_agg_metric(items):
    preds = [pred[0] for pred, _, _ in items]
    refs = [ref for _, ref, _ in items]
    metric_name = items[0][2].replace("unitxt_", "metrics.")
    for ref in refs:
        ref["metrics"] = [metric_name]

    result_metrics = evaluate(preds, refs)
    return result_metrics[0]["score"]["global"]["score"]


AGGREGATION_REGISTRY["unitxt"] = unitxt_agg_metric


def unitxt_metric(items):
    # This is a passthrough function
    return items


def process_results(doc, results):
    metrics = doc["metrics"]

    scores = {}
    for metric in metrics:
        metric = metric.replace("metrics.", "unitxt_")
        scores[metric] = (results, doc, metric)
        if metric not in METRIC_REGISTRY:
            register_metric(
                metric=metric,
                higher_is_better=True,
                output_type="generate_until",
                aggregation="unitxt",
            )(unitxt_metric)
    return scores
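A minimal sketch (not part of the commit) of how this wrapper is expected to be driven: the task config routes per-document scoring through process_results, which maps each unitxt metric name to a "unitxt_*" key, registers it on first sight, and hands back (results, doc, metric) triples that the "unitxt" aggregation later forwards to unitxt.evaluate. The doc fields below are illustrative and the snippet assumes the unitxt package is installed and the module is importable as unitxt_wrapper.

    from unitxt_wrapper import process_results  # the file added above

    # Hypothetical document as produced by a unitxt-backed task; only the
    # "metrics" key is actually consumed by process_results.
    doc = {"metrics": ["metrics.rouge"], "references": ["a short gold summary"]}
    results = ["a model-generated summary"]

    scores = process_results(doc, results)
    # -> {"unitxt_rouge": (["a model-generated summary"], {...doc...}, "unitxt_rouge")}
    print(list(scores))  # ['unitxt_rouge']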
lm_eval/tasks/unitxt/xsum.yaml (new file, mode 100644)

include: unitxt_tasks.summarization.abstractive
task: xsum
dataset_name: card=cards.xsum,template=templates.summarization.abstractive.full
lm_eval/tasks/unitxt/yahoo_answers_topics.yaml (new file, mode 100644)

include: unitxt_tasks.classification.multi_class
task: yahoo_answers_topics
dataset_name: card=cards.yahoo_answers_topics,template=templates.classification.multi_class.title
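For orientation, a hedged sketch of how one of the new unitxt-backed tasks could be invoked once the optional dependency named in the wrapper's error message (`pip install 'lm_eval[unitxt]'`) is available; model choice and limit are illustrative only.

    from lm_eval import evaluator

    out = evaluator.simple_evaluate(
        model="hf",
        model_args="pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        tasks=["xsum"],   # resolved via lm_eval/tasks/unitxt/xsum.yaml above
        limit=5,          # keep the smoke run cheap
    )
    print(out["results"]["xsum"])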
lm_eval/utils.py

@@ -26,6 +26,11 @@ eval_logger = logging.getLogger("lm-eval")
 SPACING = " " * 47

+HIGHER_IS_BETTER_SYMBOLS = {
+    True: "↑",
+    False: "↓",
+}
+

 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()

@@ -76,6 +81,18 @@ def handle_non_serializable(o):
         return str(o)


+def sanitize_list(sub):
+    """
+    Takes possible nested list and recursively converts all inner component to strings
+    """
+    if isinstance(sub, list):
+        return [sanitize_list(item) for item in sub]
+    if isinstance(sub, tuple):
+        return tuple(sanitize_list(item) for item in sub)
+    else:
+        return str(sub)
+
+
 def simple_parse_args_string(args_string):
     """
     Parses something like

@@ -257,6 +274,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
         "Filter",
         "n-shot",
         "Metric",
+        "",
         "Value",
         "",
         "Stderr",

@@ -276,10 +294,8 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
     for k in keys:
         dic = result_dict[column][k]
         version = result_dict["versions"].get(k, " N/A")
-        if k in result_dict["n-shot"]:
-            n = str(result_dict["n-shot"][k])
-        else:
-            n = " "
+        n = str(result_dict.get("n-shot", " ").get(k, " "))
+        higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})

         if "alias" in dic:
             k = dic.pop("alias")

@@ -290,13 +306,16 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
                 continue
             if v != " ":
                 v = "%.4f" % v
+            hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")

             if m + "_stderr" + "," + f in dic:
                 se = dic[m + "_stderr" + "," + f]
                 if se != "N/A":
                     se = "%.4f" % se
-                values.append([k, version, f, n, m, v, "±", se])
+                values.append([k, version, f, n, m, hib, v, "±", se])
             else:
-                values.append([k, version, f, n, m, v, "", ""])
+                values.append([k, version, f, n, m, hib, v, "", ""])
             k = ""
             version = ""
     md_writer.value_matrix = values
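Illustrative only (not in the diff): what the two additions to lm_eval.utils do in isolation. sanitize_list stringifies the leaves of nested lists/tuples, and HIGHER_IS_BETTER_SYMBOLS supplies the arrow that make_table now prints in the new column between "Metric" and "Value".

    from lm_eval.utils import HIGHER_IS_BETTER_SYMBOLS, sanitize_list

    print(sanitize_list([1, (2, [3, None])]))   # ['1', ('2', ['3', 'None'])]
    print(HIGHER_IS_BETTER_SYMBOLS[True])       # '↑'
    print(HIGHER_IS_BETTER_SYMBOLS[False])      # '↓'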
pyproject.toml

@@ -19,7 +19,7 @@ classifiers = [
 requires-python = ">=3.8"
 license = { "text" = "MIT" }
 dependencies = [
-    "accelerate>=0.21.0",
+    "accelerate>=0.26.0",
     "evaluate",
     "datasets>=2.16.0",
     "evaluate>=0.4.0",

@@ -39,6 +39,7 @@ dependencies = [
     "dill",
     "word2number",
     "more_itertools",
     "shortuuid",
 ]

 [tool.setuptools.packages.find]

@@ -73,9 +74,10 @@ promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-vllm = ["vllm==0.3.2"]
+vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
+unitxt = ["unitxt"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",

@@ -94,6 +96,7 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
+    "lm_eval[unitxt]"
 ]

 [tool.ruff.lint]
tests/models/test_gguf.py

@@ -15,11 +15,11 @@ base_url = "https://matthoffner-ggml-llm-api.hf.space"
 def gguf_completion_mock(base_url=None, **kwargs):
     # Generate a hash from the parameters
     hash_kwargs = {"base_url": base_url, **kwargs}
-    hash = hashlib.sha256(
+    parameters_hash = hashlib.sha256(
         json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
     ).hexdigest()

-    fname = f"./tests/testdata/gguf_test_{hash}.pkl"
+    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"

     if os.path.exists(fname):
         with open(fname, "rb") as fh:
tests/models/test_huggingface.py

 from __future__ import annotations

+import os
 import sys
 from pathlib import Path

 import numpy as np
 import torch

-import lm_eval.tasks as tasks
+from lm_eval import tasks
 from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

+TEST_STRING = "foo bar"


 class Test_HFLM:
     torch.use_deterministic_algorithms(True)

@@ -107,7 +111,7 @@ class Test_HFLM:
         file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
         file_path = file_path.resolve()
-        with open(file_path, "w") as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write("\n".join(str(x) for x in _res))
         assert np.allclose(_res, _RES, atol=1e-2)

     # check indices for Multiple Choice

@@ -126,19 +130,19 @@ class Test_HFLM:
         assert np.allclose(res, self.ROLLING_RES, atol=1e-1)

     def test_toc_encode(self) -> None:
-        res = self.LM.tok_encode("foo bar")
+        res = self.LM.tok_encode(TEST_STRING)
         assert res == [12110, 2534]

     def test_toc_decode(self) -> None:
         res = self.LM.tok_decode([12110, 2534])
-        assert res == "foo bar"
+        assert res == TEST_STRING

     def test_batch_encode(self) -> None:
-        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
+        res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
         assert res == [[12110, 2534], [2009, 17374]]

     def test_model_generate(self) -> None:
-        context = self.LM.tok_batch_encode(["foo bar"])[0]
+        context = self.LM.tok_batch_encode([TEST_STRING])[0]
         res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
         res = self.LM.tok_decode(res[0])
         assert res == "foo bar\n<bazhang>!info bar"
tests/models/test_neuralmagic.py

 import pytest

-import lm_eval.evaluator as evaluator
+from lm_eval import evaluator
 from lm_eval.api.registry import get_model
tests/models/test_openvino.py

@@ -6,7 +6,7 @@ import pytest
 from optimum.intel import OVModelForCausalLM
 from transformers import AutoTokenizer

-import lm_eval.evaluator as evaluator
+from lm_eval import evaluator
 from lm_eval.api.registry import get_model

@@ -46,7 +46,7 @@ def test_evaluator(model_id, task):
     random.seed(42)
     for _ in reqs:
-        res.append((-random.random(), False))
+        res.extend([(-random.random(), False)])

     return res

@@ -57,7 +57,7 @@ def test_evaluator(model_id, task):
     res = []
     random.seed(42)
     for _ in reqs:
-        res.append(-random.random())
+        res.extend([-random.random()])

     return res

@@ -79,7 +79,7 @@ def test_ov_config():
     model_id = "hf-internal-testing/tiny-random-gpt2"
     with tempfile.TemporaryDirectory() as tmpdirname:
         config_file = str(Path(tmpdirname) / "ov_config.json")
-        with open(Path(config_file), "w") as f:
+        with open(Path(config_file), "w", encoding="utf-8") as f:
             f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}')
         lm = get_model("openvino").create_from_arg_string(
             f"pretrained={model_id},ov_config={config_file}"
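Many of the test edits in this commit (here and in the files that follow) are the same mechanical substitution; a minimal sketch of the pattern, which is behavior-preserving for a single element:

    res = []
    res.append((1.0, False))     # old style seen on the removed lines
    res.extend([(1.0, False)])   # new style on the added lines; same effect
    assert res[0] == res[1]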
tests/models/test_vllm.py

@@ -3,7 +3,7 @@ from typing import List
 import pytest
 import torch

-import lm_eval.tasks as tasks
+from lm_eval import tasks
 from lm_eval.api.instance import Instance
tests/test_evaluator.py

 # import lm_eval.base as base
+import os
 from typing import List

 import pytest

 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
 from lm_eval import tasks

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces


 @pytest.mark.parametrize(
-    "task_name,limit,model,model_args",
+    "task_name,limit,model,model_args,bootstrap_iters",
     [
         (
             ["arc_easy"],
             10,
             "hf",
             "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+            0,
         ),
         (
             ["mmlu_abstract_algebra"],
             None,
             "hf",
             "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+            10000,
         ),
     ],
 )
-def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
-    # task_name = task_name
-    # limit = 10
+def test_evaluator(
+    task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
+):
     e1 = evaluator.simple_evaluate(
         model=model,
         tasks=task_name,
         limit=limit,
         model_args=model_args,
+        bootstrap_iters=bootstrap_iters,
     )
     assert e1 is not None

@@ -57,6 +59,7 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
         lm=lm,
         task_dict=task_dict,
         limit=limit,
+        bootstrap_iters=bootstrap_iters,
     )
     assert e2 is not None
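A hedged sketch of the new parameter exercised above: bootstrap_iters is now threaded through the test into both evaluation entry points, so the matrix can run with a small or zero bootstrap budget. Values below are illustrative.

    from lm_eval import evaluator

    res = evaluator.simple_evaluate(
        model="hf",
        model_args="pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        tasks=["arc_easy"],
        limit=10,
        bootstrap_iters=0,  # presumably skips bootstrap stderr estimation entirely
    )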
tests/test_janitor.py

+import os
 from collections import defaultdict

 from lm_eval.decontamination.janitor import (

@@ -9,23 +10,41 @@ from lm_eval.decontamination.janitor import (
 )

+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+TEST_SEQUENCE = (
+    "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
+    " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
+)
+
+JANITOR_EXPECTED = (
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing "
+    " characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+)
+
+JANITOR_FILTH1 = "filth lots of dirty filthy filth"
+JANITOR_FILTH2 = "filth lots of filthy dirty filth"
+

 def simple_ngram(sequence, n):
     ngrams = list()
     ngram = []
     for x in sequence:
-        ngram.append(x)
+        ngram.extend([x])
         if len(ngram) == n:
-            ngrams.append(tuple(ngram))
+            ngrams.extend([tuple(ngram)])
             ngram = ngram[1:]
     return ngrams


 def test_form_ngrams():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     n_values = [1, 2, 3, 5, 13]

     for n in n_values:

@@ -36,10 +55,7 @@ def test_form_ngrams():
 def test_word_ngrams():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     words = sequence.split()

@@ -53,10 +69,7 @@ def test_word_ngrams():
 def test_split_indices():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     comparison = []
     current_word = ""

@@ -65,12 +78,18 @@ def test_split_indices():
             current_word += c
         else:
             if current_word:
-                comparison.append((current_word, (i - len(current_word), i - 1)))
+                comparison.extend([(current_word, (i - len(current_word), i - 1))])
                 current_word = ""

     if current_word:
-        comparison.append(
-            (current_word, (len(sequence) - len(current_word), len(sequence) - 1))
-        )
+        len_sequence = len(sequence)
+        comparison.extend(
+            [
+                (
+                    current_word,
+                    (len_sequence - len(current_word), len_sequence - 1),
+                )
+            ]
+        )
         current_word = ""

@@ -80,10 +99,7 @@ def test_split_indices():
 def test_word_ngrams_indices():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     n_values = [1, 2, 3, 5, 13]

@@ -100,14 +116,13 @@ def test_word_ngrams_indices():
                 tracker[ngram] = end + 1

                 # ignore partial word matches
-                if (start != 0 and sequence[start - 1] != " ") or (
-                    end != len(sequence) - 1 and sequence[end + 1] != " "
+                if not (
+                    (start != 0 and sequence[start - 1] != " ")
+                    or (end != len(sequence) - 1 and sequence[end + 1] != " ")
                 ):
-                    pass
-                else:
                     break

-            comparison.append((ngram, (start, end)))
+            comparison.extend([(ngram, (start, end))])

         result_to_test = list(word_ngrams_indices(sequence, n))
         assert len(result_to_test) == len(comparison)

@@ -184,17 +199,6 @@ def test_janitor2():
     filth = "filth"
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )

     janitor = Janitor(
         ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -207,7 +211,7 @@ def test_janitor2():
     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor3():

@@ -229,19 +233,6 @@ def test_janitor3():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filth = "filth lots of dirty filthy filth"
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -249,12 +240,12 @@ def test_janitor3():
     result = "".join(result)
     assert result == sequence

-    janitor.register_contaminant(filth)
-    assert janitor.dirt_ngrams == {filth}
+    janitor.register_contaminant(JANITOR_FILTH1)
+    assert janitor.dirt_ngrams == {JANITOR_FILTH1}

     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor4():

@@ -284,19 +275,6 @@ def test_janitor4():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filth = "filth lots of dirty filthy filth"
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -304,12 +282,12 @@ def test_janitor4():
     result = "".join(result)
     assert result == sequence

-    janitor.register_contaminant(filth)
-    assert janitor.dirt_ngrams == {filth}
+    janitor.register_contaminant(JANITOR_FILTH1)
+    assert janitor.dirt_ngrams == {JANITOR_FILTH1}

     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor5():

@@ -338,18 +316,7 @@ def test_janitor5():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )
+    filths = [JANITOR_FILTH1, JANITOR_FILTH2]

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -364,7 +331,7 @@ def test_janitor5():
     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor6():

@@ -401,18 +368,7 @@ def test_janitor6():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )
+    filths = [JANITOR_FILTH1, JANITOR_FILTH2]

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -427,7 +383,7 @@ def test_janitor6():
     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor7():

@@ -465,7 +421,7 @@ def test_janitor7():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
+    filths = [JANITOR_FILTH1, JANITOR_FILTH2]

     expected_result = ""

@@ -488,20 +444,3 @@ def test_janitor7():
 def test_janitor8():
     # This will test the save and load contams
     pass

-    # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
-    # contaminant = "dirty boy. Clean he he"

-    # jan = Janitor(ngram_n=3)
-    # jan.register_contaminant(contaminant)
-    # cleaned = " ".join(jan.clean(source))

-    # for contam in jan.dirt_ngrams:
-    #     assert contam not in cleaned, contam

-    # filename = "data/saved_contam"
-    # jan.save_contamination_ngrams(filename)

-    # jan = Janitor(ngram_n=3)
-    # jan.load_contamination_ngrams(filename)
-    # cleaned = " ".join(jan.clean(source))

-    # for contam in jan.dirt_ngrams:
-    #     assert contam not in cleaned, contam
tests/test_requests_caching.py

 # import lm_eval.base as base
 import importlib
 import os
 import sys
 from datetime import datetime
-from typing import List, Tuple
+from typing import List, Optional, Tuple

 import pytest
 import torch

 # import lm_eval.models as models
 from lm_eval.caching.cache import PATH

@@ -43,7 +41,7 @@ def clear_cache():
 # leaving tasks here to allow for the option to select specific task files
-def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
+def get_cache_files(tasks: Optional[List[str]] = None) -> Tuple[List[str], List[str]]:
     cache_files = os.listdir(PATH)
     file_task_names = []

@@ -51,7 +49,7 @@ def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
     for file in cache_files:
         file_without_prefix = file.split("-")[1]
         file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
-        file_task_names.append(file_without_prefix_and_suffix)
+        file_task_names.extend([file_without_prefix_and_suffix])

     return cache_files, file_task_names

@@ -113,10 +111,11 @@ if __name__ == "__main__":
         # test_requests_caching_refresh,
         # test_requests_caching_delete,
     ]

+    # Lookups of global names within a loop is inefficient, so copy to a local variable outside of the loop first
+    default_tasks = DEFAULT_TASKS
     for test_func in tests:
         clear_cache()
-        test_func(tasks=DEFAULT_TASKS)
+        test_func(tasks=default_tasks)

     print("Tests pass")
tests/test_tasks.py

+import os
 from itertools import islice

 import pytest

@@ -8,6 +9,7 @@ from lm_eval.api.task import ConfigurableTask
 from .utils import new_tasks

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

 # Default Task
 TASKS = ["arc_easy"]

@@ -87,7 +89,6 @@ class TestNewTasks:
             )
             if "multiple_choice" in task._config.output_type:
                 _array = [task.doc_to_choice(doc) for doc in arr]
-                # assert all(len(x) == 4 for x in _array)
                 assert all(isinstance(x, list) for x in _array)
                 assert all(isinstance(x[0], str) for x in _array)

@@ -101,9 +102,6 @@ class TestNewTasks:
         _array_target = [task.doc_to_target(doc) for doc in arr]
         if task._config.output_type == "multiple_choice":
             assert all(isinstance(label, int) for label in _array_target)
-            # _array_text = [task.doc_to_text(doc) for doc in arr]
-            # Not working
-            # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

     def test_build_all_requests(self, task_class, limit):
         task_class.build_all_requests(rank=1, limit=limit, world_size=1)

@@ -118,5 +116,4 @@ class TestNewTasks:
             else list(islice(task.validation_docs(), limit))
         )
         requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-        # assert all(isinstance(doc, list) for doc in requests)
         assert len(requests) == limit if limit else True
tests/test_utils.py

@@ -41,7 +41,7 @@ def test_get_rolling_token_windows_v1():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -70,7 +70,7 @@ def test_get_rolling_token_windows_v2():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -115,7 +115,7 @@ def test_get_rolling_token_windows_v3():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -156,7 +156,7 @@ def test_get_rolling_token_windows_v4():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -185,7 +185,7 @@ def test_get_rolling_token_windows_v5():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -210,7 +210,7 @@ def test_get_rolling_token_windows_v6():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -273,26 +273,26 @@ class TestCollator:
         generation_samples = self.make_generate_sample(int(end))
         gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
-        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
+        chunks_gen = gens.get_batched(n=int(batch_size), batch_fn=None)
         output = []
-        for chunks in chunks:
-            group_one = end // 2
-            group_two = end - end // 2
+        is_batch = batch_size != 0
+        for chunks in chunks_gen:
+            # check batching
+            group_one = end // 2
+            group_two = end - end // 2
             assert (
                 len(chunks) <= batch_size
-                if batch_size != 0
+                if is_batch
                 else len(chunks) in [group_one, group_two]
             )
             # check if reorder-er is working correctly
-            assert all(
-                len(chunks[i][0]) <= len(chunks[i - 1][0])
-                for i in range(1, len(chunks))
-            )
+            chunk_lengths = [len(chunk[0]) for chunk in chunks]
+            assert chunk_lengths == sorted(chunk_lengths, reverse=True)
             # check if grouping correctly
-            assert all(x[1] == chunks[0][1] for x in chunks)
+            chunk_to_compare = chunks[0][1]
+            assert all(x[1] == chunk_to_compare for x in chunks)
             for x in chunks:
-                output.append(x)
+                output.extend([x])
         reordered_output = gens.get_original(output)
         # check get original
         assert reordered_output == generation_samples

@@ -305,18 +305,17 @@ class TestCollator:
             loglikelihood_samples,
             _collate_log,
         )
-        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
+        chunks_gen = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
         output = []
-        for chunks in chunks:
+        is_batch = batch_size != 0
+        for chunks in chunks_gen:
             # check batching
-            assert len(chunks) <= batch_size if batch_size != 0 else len(chunks) == end
+            assert len(chunks) <= batch_size if is_batch else len(chunks) == end
             # check reorder
-            assert all(
-                len(chunks[i][1]) <= len(chunks[i - 1][1])
-                for i in range(1, len(chunks))
-            )
+            chunk_lengths = [len(chunk[1]) for chunk in chunks]
+            assert chunk_lengths == sorted(chunk_lengths, reverse=True)
             for x in chunks:
-                output.append(x[1])
+                output.extend([x[1]])
         # check indices
         reordered_output = loglikelihoods.get_original(output)
         assert reordered_output == [x[1] for x in loglikelihood_samples]

@@ -335,18 +334,17 @@ class TestCollator:
             group_fn=lambda a: a[-2] + a[-1][:-1],
             group_by="contexts",
         )
-        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
+        chunks_gen = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
         output = []
         outputs_ = []
-        for chunks in chunks:
+        is_batch = batch_size != 0
+        for chunks in chunks_gen:
             # check batching
-            if batch_size != 0:
+            if is_batch:
                 assert len(chunks) <= batch_size
             # check reorder
-            assert all(
-                len(chunks[i][1]) <= len(chunks[i - 1][1])
-                for i in range(1, len(chunks))
-            )
+            chunk_lengths = [len(chunk[1]) for chunk in chunks]
+            assert chunk_lengths == sorted(chunk_lengths, reverse=True)
             for x in chunks:
                 for request_str, cont_toks, logits in loglikelihoods.get_cache(
                     req_str="".join(x[0]),

@@ -356,8 +354,8 @@ class TestCollator:
                     .unsqueeze(0)
                     .unsqueeze(0),
                 ):
-                    output.append(x[1])
-                    outputs_.append(cont_toks)
+                    output.extend([x[1]])
+                    outputs_.extend([cont_toks])
         assert len(output) == len(outputs_)
         # check indices
         reordered_output = loglikelihoods.get_original(output)
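The Collator test rewrite above is largely a shadowing cleanup; a minimal sketch (names hypothetical) of why the rename helps: the old loop reused one name for both the batch generator and the current batch, so the generator object was no longer reachable by name inside the loop body and the code was easy to misread.

    chunks_gen = ([1, 2], [3])          # stand-in for gens.get_batched(...)
    output = []
    for chunks in chunks_gen:           # old code wrote: for chunks in chunks:
        output.extend([len(chunks)])
    print(output)                       # [2, 1]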
tests/testyamls/test-01.yaml

@@ -3,12 +3,12 @@ group_alias: test 1
 task:
   - piqa # string task
   - ai2_arc # string tag
-  - task: super-glue-lm-eval-v1 # Should this be spread out?
-    num_fewshot: 3
+  # - task: super-glue-lm-eval-v1 # Should this be spread out?
+  #   num_fewshot: 3
   - task: swag # dict registered task
     num_fewshot: 2
-  - task: mmlu
-    num_fewshot: 5
+  # - task: mmlu
+  #   num_fewshot: 5
   - group: nli-tasks # dict group
     task:
       - anli

@@ -17,29 +17,31 @@ task:
     num_fewshot: 4
     metric_list:
       - metric: brier_score
-  - task: sciq # dict registered task duplicate
-    task_alias: sciq 2-shot
-    num_fewshot: 2
-  - task: sciq # dict registered task duplicate
-    task_alias: sciq 4-shot
-    num_fewshot: 4
-  - task: sciq # dict registered task duplicate
-    task_alias: sciq 6-shot
-    num_fewshot: 6
-  - task: siqa_custom # dict task
-    dataset_path: social_i_qa
-    dataset_name: null
-    output_type: multiple_choice
-    training_split: train
-    validation_split: validation
-    doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
-    target_delimiter: " "
-    doc_to_choice:
-      - "{{answerA}}"
-      - "{{answerB}}"
-      - "{{answerC}}"
-    doc_to_target: "{{ (label|int) - 1 }}"
-    metric_list:
-      - metric: acc
-        aggregation: mean
-        higher_is_better: true
-        aggregate_metric: true
+  # - task: sciq # dict registered task duplicate
+  #   task_alias: sciq 2-shot
+  #   num_fewshot: 2
+  # - task: sciq # dict registered task duplicate
+  #   task_alias: sciq 4-shot
+  #   num_fewshot: 4
+  # - task: sciq # dict registered task duplicate
+  #   task_alias: sciq 6-shot
+  #   num_fewshot: 6
+  # - task: siqa_custom # dict task
+  #   dataset_path: social_i_qa
+  #   dataset_name: null
+  #   output_type: multiple_choice
+  #   training_split: train
+  #   validation_split: validation
+  #   doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
+  #   target_delimiter: " "
+  #   doc_to_choice:
+  #     - "{{answerA}}"
+  #     - "{{answerB}}"
+  #     - "{{answerC}}"
+  #   doc_to_target: "{{ (label|int) - 1 }}"
+  #   metric_list:
+  #     - metric: acc
+  #       aggregation: mean
+  #       higher_is_better: true
tests/utils.py

@@ -12,9 +12,9 @@ from lm_eval.utils import load_yaml_config
 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
-    with open(file_path, "r") as f:
+    with open(file_path, "r", encoding="utf-8") as f:
         content = f.read()
-        words_list = [x for x in content.split()]
+        words_list = list(content.split())
     return words_list

@@ -25,7 +25,7 @@ def load_changed_files(file_path: str) -> List[str]:
 def parser(full_path: List[str]) -> List[str]:
     _output = set()
     for x in full_path:
-        if os.path.exists(x) and x.endswith(".yaml"):
+        if x.endswith(".yaml") and os.path.exists(x):
             config = load_yaml_config(x, mode="simple")
             if isinstance(config["task"], str):
                 _output.add(config["task"])

@@ -40,10 +40,9 @@ def new_tasks() -> Union[List[str], None]:
         # If tasks folder has changed then we get the list of files from FILENAME
         # and parse the yaml files to get the task names.
         return parser(load_changed_files(FILENAME))
-    elif os.getenv("API") is not None:
+    if os.getenv("API") is not None:
         # Or if API has changed then we set the ENV variable API to True
         # and run given tasks.
         return ["arc_easy", "hellaswag", "piqa", "wikitext"]
     # if both not true just do arc_easy
-    else:
-        return
+    return None