Commit 0d1ef037 authored by lintangsutawika

solved merge conflict

parents aa44be3f ada4a31d
......@@ -8,18 +8,18 @@ Arguments
directory and the unsorted buckets are removed after.
"""
import argparse
import glob
import logging
import os
import signal
import subprocess
from signal import SIGINT
from tqdm import tqdm
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
terminate = False
......@@ -31,7 +31,7 @@ def handler(signal_received, frame):
def sort_13_gram_buckets(working_directory):
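    # Collect all unsorted 13-gram bucket files in the working directory.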
    bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt"))
for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
sorted_file_path = bucket_file_path + ".sorted"
......@@ -49,7 +49,6 @@ parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", default="")
if __name__ == "__main__":
version = 1.00
print(f"Running version {version}")
......
import random
import transformers
from lm_eval import evaluator, tasks
from lm_eval.base import LM
......
from itertools import islice
from lm_eval import tasks
ct = 3
for (
......
import random
import torch
import torch.nn.functional as F
import transformers
random.seed(42)
......
......@@ -2,10 +2,11 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
from pytablewriter import LatexTableWriter, MarkdownTableWriter
logging.basicConfig(level=logging.INFO)
......
......@@ -4,9 +4,11 @@ Usage:
"""
import argparse
import logging
from pytablewriter import MarkdownTableWriter
from lm_eval import tasks
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
import argparse
import os
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import scipy.stats
import torch
import lm_eval.evaluator
from lm_eval import tasks, utils
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger
......
......@@ -5,7 +5,7 @@ import subprocess
import time
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import ALL_TASKS
......@@ -136,14 +136,16 @@ def main():
args = parse_args()
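    # Normalize comma-separated CLI arguments into Python lists.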
    args.branches = (
        args.branches.split(",") if isinstance(args.branches, str) else args.branches
    )
    args.models = (
        args.models.split(",") if isinstance(args.models, str) else args.models
    )
args.tasks = (
ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
        if isinstance(args.tasks, str)
else args.tasks
)
......
import argparse
import json
import os
import random
import numpy as np
from lm_eval import tasks
from lm_eval.tasks import include_path, initialize_tasks
from lm_eval.utils import eval_logger, join_iters
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
......
import argparse
import json
import os
import re
from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger
def parse_args():
parser = argparse.ArgumentParser(
description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
)
parser.add_argument(
"--data_path",
required=True,
help="Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.",
)
parser.add_argument(
"--project_name",
required=True,
help="The name of the generated Zeno project.",
)
return parser.parse_args()
def main():
"""Upload the results of your benchmark tasks to the Zeno AI evaluation platform.
    This script expects your results to live in a data folder where subfolders contain results of individual models.
"""
args = parse_args()
client = ZenoClient(os.environ["ZENO_API_KEY"])
# Get all model subfolders from the parent data folder.
models = [
os.path.basename(os.path.normpath(f))
for f in os.scandir(Path(args.data_path))
if f.is_dir()
]
assert len(models) > 0, "No model directories found in the data_path."
tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks.
old_tasks = tasks.copy()
task_count = len(tasks)
model_tasks = tasks_for_model(model, args.data_path)
        tasks = tasks.intersection(set(model_tasks))
if task_count != len(tasks):
eval_logger.warning(
f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
)
assert (
len(tasks) > 0
), "Must provide at least one task in common amongst models to compare."
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
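            # Normalize model_args ("/" and "=" become "__") so the path matches the per-task sample files on disk.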
model_args = re.sub(
"/|=",
"__",
json.load(open(Path(args.data_path, model, "results.json")))["config"][
"model_args"
],
)
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
) as file:
data = json.loads(file.read())
configs = json.load(open(Path(args.data_path, model, "results.json")))[
"configs"
]
config = configs[task]
if model_index == 0: # Only need to assemble data for the first model
metrics = []
for metric in config["metric_list"]:
metrics.append(
ZenoMetric(
name=metric["metric"],
type="mean",
columns=[metric["metric"]],
)
)
project = client.create_project(
name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
view="text-classification",
metrics=metrics,
)
project.upload_dataset(
generate_dataset(data, config),
id_column="id",
data_column="data",
label_column="labels",
)
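            # Upload this model's outputs to the project as a separate Zeno system.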
project.upload_system(
generate_system_df(data, config),
name=model,
id_column="id",
output_column="output",
)
def tasks_for_model(model: str, data_path: str):
"""Get the tasks for a specific model.
Args:
model (str): The name of the model.
data_path (str): The path to the data.
Returns:
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
    config = json.load(open(Path(dir_path, "results.json")))["configs"]
    return list(config.keys())
def generate_dataset(
data,
config,
):
"""Generate a Zeno dataset from evaluation data.
Args:
data: The data to generate a dataset for.
config: The configuration of the task.
Returns:
        pd.DataFrame: A dataframe ready to be uploaded to Zeno.
"""
ids = [x["doc_id"] for x in data]
labels = [x["target"] for x in data]
instance = [""] * len(ids)
if config["output_type"] == "loglikelihood":
instance = [x["arguments"][0][0] for x in data]
labels = [x["arguments"][0][1] for x in data]
elif config["output_type"] == "multiple_choice":
instance = [
x["arguments"][0][0]
+ "\n\n"
+ "\n".join([f"- {y[1]}" for y in x["arguments"]])
for x in data
]
elif config["output_type"] == "loglikelihood_rolling":
instance = [x["arguments"][0][0] for x in data]
elif config["output_type"] == "generate_until":
instance = [x["arguments"][0][0] for x in data]
return pd.DataFrame(
{
"id": ids,
"data": instance,
"input_len": [len(x) for x in instance],
"labels": labels,
"output_type": config["output_type"],
}
)
def generate_system_df(data, config):
"""Generate a dataframe for a specific system to be uploaded to Zeno.
Args:
data: The data to generate a dataframe from.
config: The configuration of the task.
Returns:
        pd.DataFrame: A dataframe ready to be uploaded to Zeno as a system.
"""
ids = [x["doc_id"] for x in data]
system_dict = {"id": ids}
system_dict["output"] = [""] * len(ids)
if config["output_type"] == "loglikelihood":
system_dict["output"] = [
"correct" if x["filtered_resps"][0][1] is True else "incorrect"
for x in data
]
elif config["output_type"] == "multiple_choice":
system_dict["output"] = [
", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data
]
system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data]
elif config["output_type"] == "loglikelihood_rolling":
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
elif config["output_type"] == "generate_until":
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
metrics = {}
for metric in config["metric_list"]:
if "aggregation" in metric and metric["aggregation"] == "mean":
metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
system_dict.update(metrics)
system_df = pd.DataFrame(system_dict)
return system_df
if __name__ == "__main__":
main()
import setuptools
# This is to make sure that the package supports editable installs
setuptools.setup()
import hashlib
import json
import os
import pickle
import unittest
from unittest.mock import patch
from lm_eval.api.instance import Instance
from lm_eval.models.gguf import GGUFLM
base_url = "https://matthoffner-ggml-llm-api.hf.space"
......
from __future__ import annotations
import sys
from pathlib import Path
import numpy as np
import pytest
import torch
import lm_eval.tasks as tasks
from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
tasks.initialize_tasks()
......@@ -106,9 +109,10 @@ class Test_HFLM:
f.write("\n".join(str(x) for x in _res))
assert np.allclose(_res, _RES, atol=1e-2)
# check indices for Multiple Choice
        argmax_RES, argmax_res = (
            np.argmax(np.array(_RES).reshape(-1, 4), axis=1),
            np.argmax(np.array(_res).reshape(-1, 4), axis=1),
        )
assert (argmax_RES == argmax_res).all()
def test_generate_until(self) -> None:
......
import sys
from typing import List
import pytest
import torch
import lm_eval.tasks as tasks
from lm_eval.api.instance import Instance
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
......
import os
import random
from typing import List
import pytest
# import lm_eval.base as base
import lm_eval.api as api
import lm_eval.api.registry as registry
import lm_eval.evaluator as evaluator
# import lm_eval.models as models
import lm_eval.tasks as tasks
tasks.initialize_tasks()
......
import re
from collections import defaultdict
from lm_eval.decontamination.janitor import (
Janitor,
form_ngrams,
    split_indices,
    word_ngrams,
word_ngrams_indices,
)
......@@ -81,7 +80,6 @@ def test_split_indices():
def test_word_ngrams_indices():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
......@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
# Assumptions from GPT3 Paper:
# the 200 characters to remove include punctuation and are actually a half-window
# All tests below initially test without any registered contaminants, expecting the same sequence back.
def test_janitor1():
    # First test using a 1-gram, expecting the first block before the filth to have some remaining
    # characters, but the second block to be removed completely.
......@@ -165,7 +163,6 @@ def test_janitor1():
def test_janitor2():
    # Second test using a 1-gram, expecting the first block before the filth to have some remaining
    # characters, and the second block is longer than 200 characters so should also have some remaining.
......@@ -214,7 +211,6 @@ def test_janitor2():
def test_janitor3():
# Same test as above but with a 6gram.
sequence = (
......@@ -262,7 +258,6 @@ def test_janitor3():
def test_janitor4():
# This test adds another block to that from the previous. The middle block should be entirely
# removed as the 200 characters are removed from each side.
......@@ -318,7 +313,6 @@ def test_janitor4():
def test_janitor5():
# Same as above but using multiple different filth 6grams.
sequence = (
......@@ -374,7 +368,6 @@ def test_janitor5():
def test_janitor6():
# Same as above but now we add 10 filths and expect the same result, the following test does 11.
sequence = (
......@@ -438,7 +431,6 @@ def test_janitor6():
def test_janitor7():
# Same as above but now we add 9 filths and expect the same result, the following test does 10.
sequence = (
......
import random
import pytest
import lm_eval.api.metrics as metrics
def test_bootstrapping():
......
from itertools import islice
import pytest
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]
......@@ -26,7 +30,7 @@ def limit() -> int:
# Tests
@pytest.mark.parametrize("task_class", task_class())
@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}")
class TestNewTasks:
def test_download(self, task_class: ConfigurableTask):
task_class.download()
......
import pytest
from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window
# noinspection DuplicatedCode
......@@ -220,3 +222,76 @@ def test_make_disjoint_window():
)
assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6])
assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6])
class TestCollator:
def make_generate_sample(self, end=10):
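        # Build (string, gen_kwargs) request pairs; the first half share one gen_kwargs dict and the second half another, to exercise grouping.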
strings = ["x" * i for i in range(1, end + 1)]
gen_kwargs1, gen_kwargs2 = (
{"temperature": 0},
{"temperature": 0, "until": ["nn", "\n\n"]},
)
args = [
(string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2)
for i, string in enumerate(strings)
]
return args
def make_loglikelihood_sample(self, end=11):
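        # Build ((context, continuation), token_list) samples with token lists of increasing length.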
samples = [
(("x", "x"), list(range(1, total_length + 1)))
for total_length in range(1, end + 1)
]
return samples
@pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
def test_generations(self, batch_size, end):
_collate_gen = lambda x: (-len(x[0]), x[0]) # noqa: E731
generation_samples = self.make_generate_sample(int(end))
gens = Collator(generation_samples, _collate_gen, grouping=True)
chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
output = []
        for batch in chunks:
            # check batching
            group_one = end // 2
            group_two = end - end // 2
            assert (
                len(batch) <= batch_size
                if batch_size != 0
                else len(batch) in [group_one, group_two]
            )
            # check that the re-ordering is working correctly
            assert all(
                len(batch[i][0]) <= len(batch[i - 1][0])
                for i in range(1, len(batch))
            )
            # check that grouping is correct
            assert all(x[1] == batch[0][1] for x in batch)
            for x in batch:
                output.append(x)
reordered_output = gens.get_original(output)
# check get original
assert reordered_output == generation_samples
@pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)])
def test_loglikelihood(self, batch_size, end):
_collate_log = lambda x: (-len(x[1]), tuple(x[1])) # noqa: E731
loglikelihood_samples = self.make_loglikelihood_sample(int(end))
loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
        for batch in chunks:
            # check batching
            assert len(batch) <= batch_size if batch_size != 0 else len(batch) == end
            # check reorder
            assert all(
                len(batch[i][1]) <= len(batch[i - 1][1])
                for i in range(1, len(batch))
            )
            for x in batch:
                output.append(x[1])
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
import random
import lm_eval.tasks
import lm_eval.models
def test_description():
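    # Check that a per-task description injected into the task config appears in the generated few-shot context.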
seed = 42
num_examples = 1
task_names = ["arc_challenge", "arc_easy"]
description_dict = {
"arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
}
task_dict = lm_eval.tasks.get_task_dict(task_names)
for task_name, task in task_dict.items():
        # patch the description field in the task config (TODO: clean this up)
        task._config.description = description_dict.get(task_name, "")
rnd = random.Random()
rnd.seed(seed)
        if task.has_training_docs():
            docs = task.training_docs()
        elif task.has_validation_docs():
            docs = task.validation_docs()
        else:
            docs = task.test_docs()
description = (
description_dict[task_name]
if description_dict and task_name in description_dict
else ""
)
for _, doc in (
zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
):
ctx = task.fewshot_context(
doc=doc,
num_fewshot=1,
)
assert description in ctx