Unverified Commit cda25fef authored by Lintang Sutawika's avatar Lintang Sutawika Committed by GitHub
Browse files

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
import re
from collections import defaultdict

from lm_eval.decontamination.janitor import (
    Janitor,
    form_ngrams,
    split_indices,
    word_ngrams,
    word_ngrams_indices,
)
......@@ -81,7 +80,6 @@ def test_split_indices():
def test_word_ngrams_indices():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
......@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
# Assumptions from GPT3 Paper:
# the 200 characters to remove include punctuation and is actually a half-window
# All tests below initially test without any registered contaminants, expecting the same sequence back.
def test_janitor1():
# First test using a 1gram and expected the first block before the filth to have some remaining
# characters, but the second block should be completely removed.
......@@ -165,7 +163,6 @@ def test_janitor1():
def test_janitor2():
# Second test using a 1gram and expected the first block before the filth to have some remaining
# characters, and the second block is longer then 200 characters so should also have some remaining.
......@@ -214,7 +211,6 @@ def test_janitor2():
def test_janitor3():
# Same test as above but with a 6gram.
sequence = (
......@@ -262,7 +258,6 @@ def test_janitor3():
def test_janitor4():
# This test adds another block to that from the previous. The middle block should be entirely
# removed as the 200 characters are removed from each side.
......@@ -318,7 +313,6 @@ def test_janitor4():
def test_janitor5():
# Same as above but using multiple different filth 6grams.
sequence = (
......@@ -374,7 +368,6 @@ def test_janitor5():
def test_janitor6():
# Same as above but now we add 10 filths and expect the same result, the following test does 11.
sequence = (
......@@ -438,7 +431,6 @@ def test_janitor6():
def test_janitor7():
# Same as above but now we add 9 filths and expect the same result, the following test does 10.
sequence = (
......
import random
import pytest
import lm_eval.api.metrics as metrics
import random
def test_bootstrapping():
......
from itertools import islice
import pytest
from .utils import new_tasks
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]
......@@ -26,7 +30,7 @@ def limit() -> int:
# Tests
@pytest.mark.parametrize("task_class", task_class())
@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}")
class TestNewTasks:
def test_download(self, task_class: ConfigurableTask):
task_class.download()
......
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window
import pytest
from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window
# noinspection DuplicatedCode
......@@ -220,3 +222,76 @@ def test_make_disjoint_window():
)
assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6])
assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6])
class TestCollator:
    """Tests for ``lm_eval.utils.Collator`` batching, sorting, and reordering.

    Covers both the grouped (generation) and ungrouped (loglikelihood) modes:
    requests are sorted by a collate key, batched, and then restored to their
    original order via ``get_original``.
    """

    def make_generate_sample(self, end=10):
        """Build ``end`` (string, gen_kwargs) request tuples.

        Strings grow in length from 1 to ``end``; the first half of the
        requests share ``gen_kwargs1`` and the second half ``gen_kwargs2``,
        so grouping by kwargs yields exactly two groups.
        """
        strings = ["x" * i for i in range(1, end + 1)]
        gen_kwargs1, gen_kwargs2 = (
            {"temperature": 0},
            {"temperature": 0, "until": ["nn", "\n\n"]},
        )
        args = [
            (string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2)
            for i, string in enumerate(strings)
        ]
        return args

    def make_loglikelihood_sample(self, end=11):
        """Build ``end`` ((context, continuation), token_list) tuples with
        token lists of increasing length 1..``end``."""
        samples = [
            (("x", "x"), list(range(1, total_length + 1)))
            for total_length in range(1, end + 1)
        ]
        return samples

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
    def test_generations(self, batch_size, end):
        """Grouped mode: batches respect size limits, are sorted within a
        batch, never mix kwargs groups, and reorder back losslessly."""
        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
        generation_samples = self.make_generate_sample(int(end))
        gens = Collator(generation_samples, _collate_gen, grouping=True)
        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
        # With batch_size == 0 each kwargs group arrives as one whole batch.
        group_one = end // 2
        group_two = end - end // 2
        output = []
        # NOTE: avoid the original's `for chunks in chunks` variable shadowing.
        for batch in chunks:
            # check batching
            assert (
                len(batch) <= batch_size
                if batch_size != 0
                else len(batch) in [group_one, group_two]
            )
            # check if reorder-er is working correctly (descending string length)
            assert all(
                len(batch[i][0]) <= len(batch[i - 1][0]) for i in range(1, len(batch))
            )
            # check if grouping correctly: every request shares the batch's kwargs
            assert all(x[1] == batch[0][1] for x in batch)
            output.extend(batch)
        # check get original
        reordered_output = gens.get_original(output)
        assert reordered_output == generation_samples

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)])
    def test_loglikelihood(self, batch_size, end):
        """Ungrouped mode: batches respect size limits, are sorted by token
        length, and ``get_original`` restores the input order."""
        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for batch in chunks:
            # check batching (batch_size == 0 means a single batch of everything)
            assert len(batch) <= batch_size if batch_size != 0 else len(batch) == end
            # check reorder (descending token-list length within a batch)
            assert all(
                len(batch[i][1]) <= len(batch[i - 1][1]) for i in range(1, len(batch))
            )
            output.extend(x[1] for x in batch)
        # check indices: reordering recovers the original token lists
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]
import random
import lm_eval.tasks
import lm_eval.models
import lm_eval.tasks
def test_description():
......@@ -14,7 +15,6 @@ def test_description():
task_dict = lm_eval.tasks.get_task_dict(task_names)
for task_name, task in task_dict.items():
# patch description field in task (# TODO: make this much more cleaned up)
task._config.description = description_dict[task_name]
......
import glob
import logging
import os
from collections import Counter
import shutil
import glob
from collections import Counter
from lm_eval.decontamination.archiver import Archive, TextReader
from lm_eval.decontamination.janitor import Janitor, word_ngrams
from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
from lm_eval.decontamination.archiver import Archive, TextReader
import logging
logger = logging.getLogger(__name__)
......@@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog):
print("rebuild")
rebuilt_ngrams = []
bucket_file_paths = glob.glob(
os.path.join(test_working_directory, "output", f"*.bkt.txt")
os.path.join(test_working_directory, "output", "*.bkt.txt")
)
for bucket_file_path in bucket_file_paths:
reader = TextReader(bucket_file_path)
......
......@@ -2,12 +2,13 @@ import hashlib
import json
import os
import pickle
import pytest
import unittest.mock as mock
import pytest
from openai import OpenAI
import lm_eval.models as models
from openai import OpenAI
client = OpenAI()
......
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import collections
import hashlib
import json
import os
import random
import pytest
import os
import json
import hashlib
import collections
import lm_eval.evaluator as evaluator
import lm_eval.models as models
import lm_eval.tasks as tasks
os.makedirs("tests/testdata", exist_ok=True)
......
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
from typing import Union
import os
from pathlib import Path
from typing import List, Union
from lm_eval.utils import load_yaml_config
# {{{CI}}}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment