Unverified commit cda25fef, authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into standardize_metrics

Parents: dfb41835, 4d10ad56
-import re
 from collections import defaultdict
 from lm_eval.decontamination.janitor import (
     Janitor,
     form_ngrams,
-    word_ngrams,
     split_indices,
+    word_ngrams,
     word_ngrams_indices,
 )
@@ -81,7 +80,6 @@ def test_split_indices():
 def test_word_ngrams_indices():
     sequence = (
         "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
         " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
 # Assumptions from the GPT-3 paper:
 # the 200 characters to remove include punctuation and are actually a half-window
 # All tests below initially test without any registered contaminants, expecting the same sequence back.


 def test_janitor1():
     # First test using a 1-gram: expect the first block before the filth to have some remaining
     # characters, but the second block should be completely removed.
@@ -165,7 +163,6 @@ def test_janitor1():
 def test_janitor2():
     # Second test using a 1-gram: expect the first block before the filth to have some remaining
     # characters; the second block is longer than 200 characters, so it should also have some remaining.
@@ -214,7 +211,6 @@ def test_janitor2():
 def test_janitor3():
     # Same test as above but with a 6-gram.
     sequence = (
@@ -262,7 +258,6 @@ def test_janitor3():
 def test_janitor4():
     # This test adds another block to the previous one. The middle block should be entirely
     # removed, as 200 characters are stripped from each side.
@@ -318,7 +313,6 @@ def test_janitor4():
 def test_janitor5():
     # Same as above but using multiple different filth 6-grams.
     sequence = (
@@ -374,7 +368,6 @@ def test_janitor5():
 def test_janitor6():
     # Same as above, but now we add 10 filths and expect the same result; the following test does 11.
     sequence = (
@@ -438,7 +431,6 @@ def test_janitor6():
 def test_janitor7():
     # Same as above, but now we add 9 filths and expect the same result; the following test does 10.
     sequence = (
...
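For orientation before the next file: the Janitor implements the GPT-3-style decontamination described in the comments above. You register n-gram "filth", and cleaning strips roughly 200 characters on each side of every match. Below is a minimal sketch of the flow these tests exercise; the ngram_n keyword and the exact shape of clean()'s return value are assumptions, since neither is visible in this diff.

    from lm_eval.decontamination.janitor import Janitor, word_ngrams

    # word_ngrams yields space-joined n-grams over whitespace-split tokens
    assert list(word_ngrams("a b c d", 2)) == ["a b", "b c", "c d"]

    sequence = "Hello my name is Bob, I like eating pizza ..."  # stand-in text
    janitor = Janitor(ngram_n=2)          # assumed kwarg, mirroring the 1-gram/6-gram tests
    janitor.register_contaminant("eating pizza")
    cleaned = janitor.clean(sequence)     # strips a window of characters around each match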
+import random
 import pytest
 import lm_eval.api.metrics as metrics
-import random


 def test_bootstrapping():
...
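test_bootstrapping presumably validates the bootstrap standard-error machinery in lm_eval.api.metrics; the diff collapses the body. As a library-independent illustration of the technique, resampling with replacement approximates the standard error of a statistic, and for the mean it should land close to the closed form stdev / sqrt(n):

    import random
    import statistics

    def bootstrap_stderr_of_mean(sample, iters=1000, seed=1234):
        # Resample with replacement, recompute the mean each time, and take
        # the spread of those means as the standard-error estimate.
        rng = random.Random(seed)
        means = [statistics.mean(rng.choices(sample, k=len(sample))) for _ in range(iters)]
        return statistics.stdev(means)

    rng = random.Random(0)
    data = [rng.gauss(0, 1) for _ in range(200)]
    analytic = statistics.stdev(data) / len(data) ** 0.5  # closed form for the mean
    assert abs(bootstrap_stderr_of_mean(data) - analytic) < 0.25 * analytic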
 from itertools import islice
 import pytest
-from .utils import new_tasks
 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask
+from .utils import new_tasks


 tasks.initialize_tasks()

 # Default Task
 TASKS = ["arc_easy"]
@@ -26,7 +30,7 @@ def limit() -> int:
 # Tests
-@pytest.mark.parametrize("task_class", task_class())
+@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}")
 class TestNewTasks:
     def test_download(self, task_class: ConfigurableTask):
         task_class.download()
...
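The only substantive change in this file is the ids= argument on parametrize, which replaces pytest's opaque default case names (task_class0, task_class1, ...) with the task name. A self-contained illustration of the effect; _FakeTask and demo_task are stand-ins, not harness classes:

    import pytest

    class _FakeConfig:
        def __init__(self, task):
            self.task = task

    class _FakeTask:
        def __init__(self, name):
            self.config = _FakeConfig(name)

    # Without ids=, pytest labels these cases task_class0, task_class1, ...;
    # with ids=, they read as test_has_config[arc_easy] and so on.
    @pytest.mark.parametrize(
        "task_class",
        [_FakeTask("arc_easy"), _FakeTask("demo_task")],
        ids=lambda x: f"{x.config.task}",
    )
    def test_has_config(task_class):
        assert task_class.config.task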
-from lm_eval.utils import get_rolling_token_windows, make_disjoint_window
+import pytest
+
+from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window


 # noinspection DuplicatedCode
@@ -220,3 +222,76 @@ def test_make_disjoint_window():
     )
     assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6])
     assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6])
+
+
+class TestCollator:
+    def make_generate_sample(self, end=10):
+        strings = ["x" * i for i in range(1, end + 1)]
+        gen_kwargs1, gen_kwargs2 = (
+            {"temperature": 0},
+            {"temperature": 0, "until": ["nn", "\n\n"]},
+        )
+        args = [
+            (string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2)
+            for i, string in enumerate(strings)
+        ]
+        return args
+
+    def make_loglikelihood_sample(self, end=11):
+        samples = [
+            (("x", "x"), list(range(1, total_length + 1)))
+            for total_length in range(1, end + 1)
+        ]
+        return samples
+
+    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
+    def test_generations(self, batch_size, end):
+        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
+        generation_samples = self.make_generate_sample(int(end))
+        gens = Collator(generation_samples, _collate_gen, grouping=True)
+        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
+        output = []
+        for batch in chunks:
+            # check batching: at most batch_size items per batch, or one
+            # whole gen_kwargs group per batch when batch_size is 0
+            group_one = end // 2
+            group_two = end - end // 2
+            assert (
+                len(batch) <= batch_size
+                if batch_size != 0
+                else len(batch) in [group_one, group_two]
+            )
+            # check that the reorderer sorts each batch longest-first
+            assert all(
+                len(batch[i][0]) <= len(batch[i - 1][0]) for i in range(1, len(batch))
+            )
+            # check grouping: every item in a batch shares the same gen_kwargs
+            assert all(x[1] == batch[0][1] for x in batch)
+            output.extend(batch)
+        # check get_original: the submission order is recoverable
+        reordered_output = gens.get_original(output)
+        assert reordered_output == generation_samples
+
+    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)])
+    def test_loglikelihood(self, batch_size, end):
+        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
+        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
+        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
+        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
+        output = []
+        for batch in chunks:
+            # check batching
+            assert len(batch) <= batch_size if batch_size != 0 else len(batch) == end
+            # check reordering: longest continuation first within a batch
+            assert all(
+                len(batch[i][1]) <= len(batch[i - 1][1]) for i in range(1, len(batch))
+            )
+            output.extend(x[1] for x in batch)
+        # check indices: get_original maps outputs back to the input order
+        reordered_output = loglikelihoods.get_original(output)
+        assert reordered_output == [x[1] for x in loglikelihood_samples]
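Read together, these tests pin down the Collator contract: sort requests by a key function, optionally group them by gen_kwargs, yield batches of at most n items, and restore the submission order afterwards. A usage sketch built only from the calls exercised above; any behavior beyond those calls is an assumption:

    from lm_eval.utils import Collator

    data = [("xxx", {"temperature": 0}), ("x", {"temperature": 0}), ("xx", {"temperature": 0})]

    # Sort longest prompt first so similarly sized requests share a batch.
    collator = Collator(data, lambda x: (-len(x[0]), x[0]), grouping=False)

    processed = []
    for batch in collator.get_batched(n=2, batch_fn=None):
        processed.extend(batch)   # batches arrive length-sorted, at most 2 items each

    # get_original undoes the sorting so results line up with the inputs
    assert collator.get_original(processed) == data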
 import random

-import lm_eval.tasks
 import lm_eval.models
+import lm_eval.tasks


 def test_description():
@@ -14,7 +15,6 @@ def test_description():
     task_dict = lm_eval.tasks.get_task_dict(task_names)

     for task_name, task in task_dict.items():
         # patch the description field in the task (TODO: clean this up)
         task._config.description = description_dict[task_name]
...
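The loop above injects a per-task description into each task config before evaluation. A condensed sketch of that setup, mirroring the calls visible in this diff; the description string is a made-up placeholder:

    import lm_eval.tasks

    lm_eval.tasks.initialize_tasks()
    description_dict = {"arc_easy": "The following are science questions.\n\n"}  # placeholder
    task_dict = lm_eval.tasks.get_task_dict(["arc_easy"])
    for task_name, task in task_dict.items():
        # same patch as above: write the description into the task config
        task._config.description = description_dict[task_name]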
+import glob
+import logging
 import os
-from collections import Counter
 import shutil
-import glob
+from collections import Counter
+
+from lm_eval.decontamination.archiver import Archive, TextReader
 from lm_eval.decontamination.janitor import Janitor, word_ngrams
 from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
-from lm_eval.decontamination.archiver import Archive, TextReader
-import logging

 logger = logging.getLogger(__name__)
@@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog):
print("rebuild") print("rebuild")
rebuilt_ngrams = [] rebuilt_ngrams = []
bucket_file_paths = glob.glob( bucket_file_paths = glob.glob(
os.path.join(test_working_directory, "output", f"*.bkt.txt") os.path.join(test_working_directory, "output", "*.bkt.txt")
) )
for bucket_file_path in bucket_file_paths: for bucket_file_path in bucket_file_paths:
reader = TextReader(bucket_file_path) reader = TextReader(bucket_file_path)
......
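The rebuild loop above reads every *.bkt.txt bucket back and checks the result against the n-grams of the source text. The invariant can be stated compactly; the document string below is a stand-in:

    from collections import Counter

    from lm_eval.decontamination.janitor import word_ngrams

    doc = "one two three four five six seven eight nine ten eleven twelve thirteen fourteen"
    expected = Counter(word_ngrams(doc, 13))  # every 13-gram of the source, with multiplicity
    # do_ngrams_in_buckets shards these across bucket files; rebuilding from
    # *.bkt.txt and recounting should reproduce `expected` exactly.
    rebuilt = Counter(word_ngrams(doc, 13))   # placeholder for the read-back side
    assert rebuilt == expected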
@@ -2,12 +2,13 @@ import hashlib
 import json
 import os
 import pickle
-import pytest
 import unittest.mock as mock
+
+import pytest
+from openai import OpenAI
+
 import lm_eval.models as models
-from openai import OpenAI

 client = OpenAI()
...
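Given the pickle and unittest.mock imports next to the module-level client, these tests presumably replay canned API responses rather than call the network. A generic sketch of that pattern using the client defined above; the response shape and model name are assumptions, not the harness's actual fixtures:

    import unittest.mock as mock

    def test_completion_offline():
        canned = mock.MagicMock()
        canned.choices = [mock.MagicMock(text=" 42")]  # assumed response shape
        with mock.patch.object(client.completions, "create", return_value=canned):
            out = client.completions.create(model="davinci-002", prompt="6*7=")
        assert out.choices[0].text.strip() == "42"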
-import lm_eval.tasks as tasks
-import lm_eval.models as models
-import lm_eval.evaluator as evaluator
+import collections
+import hashlib
+import json
+import os
 import random
+
 import pytest
-import os
-import json
-import hashlib
-import collections
+
+import lm_eval.evaluator as evaluator
+import lm_eval.models as models
+import lm_eval.tasks as tasks

 os.makedirs("tests/testdata", exist_ok=True)
...
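The hashlib/json imports plus the tests/testdata directory point at golden-file regression tests: serialize the evaluator's output deterministically, hash it, and compare against a stored digest. A hypothetical helper illustrating the pattern (not the harness's actual function):

    import hashlib
    import json
    import os

    def check_golden_hash(name, results, testdata_dir="tests/testdata"):
        # Hash a deterministic serialization of the results.
        digest = hashlib.sha256(
            json.dumps(results, sort_keys=True).encode("utf-8")
        ).hexdigest()
        path = os.path.join(testdata_dir, f"{name}.sha256")
        if not os.path.exists(path):
            with open(path, "w") as f:  # first run: record the golden digest
                f.write(digest)
            return True
        with open(path) as f:
            return f.read().strip() == digest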
-from typing import List
-from lm_eval.utils import load_yaml_config
-from pathlib import Path
-from typing import Union
 import os
+from pathlib import Path
+from typing import List, Union
+
+from lm_eval.utils import load_yaml_config


 # {{{CI}}}
...
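This last file (the CI config checker, judging by the {{{CI}}} marker) centers on load_yaml_config. A sketch of how these imports might compose; the directory layout and the single-argument signature are assumptions:

    import os
    from pathlib import Path
    from typing import List, Union

    from lm_eval.utils import load_yaml_config

    def find_yaml_configs(root: Union[str, Path]) -> List[Path]:
        # Collect every .yaml task config under a directory tree (assumed layout).
        return [
            Path(dirpath) / f
            for dirpath, _, files in os.walk(root)
            for f in files
            if f.endswith(".yaml")
        ]

    for cfg_path in find_yaml_configs("lm_eval/tasks"):
        cfg = load_yaml_config(str(cfg_path))  # signature assumed: path -> dict
        assert isinstance(cfg, dict)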