Unverified Commit cda25fef authored by Lintang Sutawika's avatar Lintang Sutawika Committed by GitHub
Browse files

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
import re
from collections import defaultdict

from lm_eval.decontamination.janitor import (
    Janitor,
    form_ngrams,
    split_indices,
    word_ngrams,
    word_ngrams_indices,
)
......@@ -81,7 +80,6 @@ def test_split_indices():
def test_word_ngrams_indices():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
......@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
# Assumptions from GPT3 Paper:
# the 200 characters to remove include punctuation and is actually a half-window
# All tests below initially test without any registered contaminants, expecting the same sequence back.
def test_janitor1():
# First test using a 1gram and expected the first block before the filth to have some remaining
# characters, but the second block should be completely removed.
......@@ -165,7 +163,6 @@ def test_janitor1():
def test_janitor2():
# Second test using a 1gram and expected the first block before the filth to have some remaining
# characters, and the second block is longer then 200 characters so should also have some remaining.
......@@ -214,7 +211,6 @@ def test_janitor2():
def test_janitor3():
# Same test as above but with a 6gram.
sequence = (
......@@ -262,7 +258,6 @@ def test_janitor3():
def test_janitor4():
# This test adds another block to that from the previous. The middle block should be entirely
# removed as the 200 characters are removed from each side.
......@@ -318,7 +313,6 @@ def test_janitor4():
def test_janitor5():
# Same as above but using multiple different filth 6grams.
sequence = (
......@@ -374,7 +368,6 @@ def test_janitor5():
def test_janitor6():
# Same as above but now we add 10 filths and expect the same result, the following test does 11.
sequence = (
......@@ -438,7 +431,6 @@ def test_janitor6():
def test_janitor7():
# Same as above but now we add 9 filths and expect the same result, the following test does 10.
sequence = (
......
import random
import pytest
import lm_eval.api.metrics as metrics
import random
def test_bootstrapping():
......
from itertools import islice
import pytest
from .utils import new_tasks
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]
......@@ -26,7 +30,7 @@ def limit() -> int:
# Tests
@pytest.mark.parametrize("task_class", task_class())
@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}")
class TestNewTasks:
def test_download(self, task_class: ConfigurableTask):
task_class.download()
......
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window
import pytest
from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window
# noinspection DuplicatedCode
......@@ -220,3 +222,76 @@ def test_make_disjoint_window():
)
assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6])
assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6])
class TestCollator:
    """Tests for ``lm_eval.utils.Collator`` batching, sorting, and reordering.

    Covers both the grouped (generation) and ungrouped (loglikelihood) modes:
    requests are sorted by a collate key, batched, and then restored to their
    original order via ``get_original``.
    """

    def make_generate_sample(self, end=10):
        """Build ``end`` (string, gen_kwargs) request tuples.

        Strings grow in length from 1 to ``end``; the first half of the
        requests share ``gen_kwargs1`` and the second half ``gen_kwargs2``,
        so grouping by kwargs yields exactly two groups.
        """
        strings = ["x" * i for i in range(1, end + 1)]
        gen_kwargs1, gen_kwargs2 = (
            {"temperature": 0},
            {"temperature": 0, "until": ["nn", "\n\n"]},
        )
        args = [
            (string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2)
            for i, string in enumerate(strings)
        ]
        return args

    def make_loglikelihood_sample(self, end=11):
        """Build ``end`` ((context, continuation), token_list) tuples with
        token lists of increasing length 1..``end``."""
        samples = [
            (("x", "x"), list(range(1, total_length + 1)))
            for total_length in range(1, end + 1)
        ]
        return samples

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
    def test_generations(self, batch_size, end):
        """Grouped mode: batches respect size limits, are sorted within a
        batch, never mix kwargs groups, and reorder back losslessly."""
        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
        generation_samples = self.make_generate_sample(int(end))
        gens = Collator(generation_samples, _collate_gen, grouping=True)
        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
        # With batch_size == 0 each kwargs group arrives as one whole batch.
        group_one = end // 2
        group_two = end - end // 2
        output = []
        # NOTE: avoid the original's `for chunks in chunks` variable shadowing.
        for batch in chunks:
            # check batching
            assert (
                len(batch) <= batch_size
                if batch_size != 0
                else len(batch) in [group_one, group_two]
            )
            # check if reorder-er is working correctly (descending string length)
            assert all(
                len(batch[i][0]) <= len(batch[i - 1][0]) for i in range(1, len(batch))
            )
            # check if grouping correctly: every request shares the batch's kwargs
            assert all(x[1] == batch[0][1] for x in batch)
            output.extend(batch)
        # check get original
        reordered_output = gens.get_original(output)
        assert reordered_output == generation_samples

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)])
    def test_loglikelihood(self, batch_size, end):
        """Ungrouped mode: batches respect size limits, are sorted by token
        length, and ``get_original`` restores the input order."""
        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for batch in chunks:
            # check batching (batch_size == 0 means a single batch of everything)
            assert len(batch) <= batch_size if batch_size != 0 else len(batch) == end
            # check reorder (descending token-list length within a batch)
            assert all(
                len(batch[i][1]) <= len(batch[i - 1][1]) for i in range(1, len(batch))
            )
            output.extend(x[1] for x in batch)
        # check indices: reordering recovers the original token lists
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]
import random
import lm_eval.tasks
import lm_eval.models
import lm_eval.tasks
def test_description():
......@@ -14,7 +15,6 @@ def test_description():
task_dict = lm_eval.tasks.get_task_dict(task_names)
for task_name, task in task_dict.items():
# patch description field in task (# TODO: make this much more cleaned up)
task._config.description = description_dict[task_name]
......
import glob
import logging
import os
from collections import Counter
import shutil
import glob
from collections import Counter
from lm_eval.decontamination.archiver import Archive, TextReader
from lm_eval.decontamination.janitor import Janitor, word_ngrams
from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
from lm_eval.decontamination.archiver import Archive, TextReader
import logging
logger = logging.getLogger(__name__)
......@@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog):
print("rebuild")
rebuilt_ngrams = []
bucket_file_paths = glob.glob(
os.path.join(test_working_directory, "output", f"*.bkt.txt")
os.path.join(test_working_directory, "output", "*.bkt.txt")
)
for bucket_file_path in bucket_file_paths:
reader = TextReader(bucket_file_path)
......
......@@ -2,12 +2,13 @@ import hashlib
import json
import os
import pickle
import pytest
import unittest.mock as mock
import pytest
from openai import OpenAI
import lm_eval.models as models
from openai import OpenAI
client = OpenAI()
......
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import collections
import hashlib
import json
import os
import random
import pytest
import os
import json
import hashlib
import collections
import lm_eval.evaluator as evaluator
import lm_eval.models as models
import lm_eval.tasks as tasks
os.makedirs("tests/testdata", exist_ok=True)
......
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
from typing import Union
import os
from pathlib import Path
from typing import List, Union
from lm_eval.utils import load_yaml_config
# {{{CI}}}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment