gaoqiong / lm-evaluation-harness · Commits

Commit 90ad5db7
authored Mar 01, 2024 by lintangsutawika

merged main

Parents: f692caa9, b177c82c

Changes: 484
Showing 4 changed files with 273 additions and 4 deletions (+273 -4)
scripts/requests_caching.py       +92  -0
tests/models/test_huggingface.py  +2   -2
tests/test_requests_caching.py    +123 -0
tests/test_utils.py               +56  -2
scripts/requests_caching.py 0 → 100644
"""
Usage:
python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests=<true|refresh|delete]>
"""
import argparse
import os
from typing import List

import torch
from transformers import (
    pipeline as trans_pipeline,
)

from lm_eval import simple_evaluate
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.utils import eval_logger

MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# Used to specify alternate cache path, useful if run in a docker container
# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image
LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

MODEL = "EleutherAI/pythia-70m"
TASK = "text-generation"


def run_model_for_task_caching(tasks: List[str], cache_requests: str):
    eval_logger.info(f"Loading HF model: {MODEL}")

    trans_pipe = trans_pipeline(
        task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True
    )

    model = trans_pipe.model
    tokenizer = trans_pipe.tokenizer

    eval_logger.info(
        f"Running simple_evaluate to cache request objects for tasks: {tasks}"
    )

    cache_args = request_caching_arg_to_dict(cache_requests=cache_requests)

    eval_logger.info(
        f"The following operations will be performed on the cache: {cache_requests}"
    )

    eval_data = simple_evaluate(
        model="hf-auto",
        model_args={
            "pretrained": model,
            "tokenizer": tokenizer,
        },
        limit=1,
        device=DEVICE,
        tasks=tasks,
        write_out=True,
        **cache_args,
    )

    return eval_data


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        metavar="task1,task2",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    args = parser.parse_args()

    tasks = args.tasks.split(",")
    # MODEL and DEVICE are module-level constants used inside the helper,
    # which only accepts tasks and cache_requests.
    eval_data = run_model_for_task_caching(
        tasks=tasks, cache_requests=args.cache_requests
    )
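Not part of the commit: a minimal sketch of how the helper above could be driven from Python instead of the CLI. The task names are just the defaults used by the new test below, and the sys.path tweak is an assumption that mirrors the one in tests/test_requests_caching.py so the scripts directory is importable.

# Sketch only: warm the request cache programmatically via run_model_for_task_caching.
import importlib
import os
import sys

MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.append(f"{MODULE_DIR}/../scripts")  # assumed layout: this file sits next to scripts/

requests_caching = importlib.import_module("requests_caching")

# cache_requests accepts the same choices as the CLI flag: "true", "refresh", "delete".
eval_data = requests_caching.run_model_for_task_caching(
    tasks=["lambada_openai", "hellaswag"],  # example tasks, same as the test defaults
    cache_requests="true",
)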
tests/models/test_huggingface.py

@@ -22,8 +22,8 @@ class Test_HFLM:
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: list[Instance] = generate_until_task.instances
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)

@@ -74,7 +74,7 @@ class Test_HFLM:
    generate_until_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " $50,000 in repairs.\n\nQuestion",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
tests/test_requests_caching.py 0 → 100644

# import lm_eval.base as base
import importlib
import os
import sys
from datetime import datetime
from typing import List, Tuple

import pytest
import torch

# import lm_eval.models as models
from lm_eval.caching.cache import PATH

MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# NOTE the script this loads uses simple evaluate
# TODO potentially test both the helper script and the normal script
sys.path.append(f"{MODULE_DIR}/../scripts")
model_loader = importlib.import_module("requests_caching")
run_model_for_task_caching = model_loader.run_model_for_task_caching

DEFAULT_TASKS = ["lambada_openai", "hellaswag"]


@pytest.fixture(autouse=True)
def setup_and_teardown():
    # Setup
    torch.use_deterministic_algorithms(False)
    clear_cache()

    # Yields control back to the test function
    yield

    # Cleanup here


def clear_cache():
    if os.path.exists(PATH):
        cache_files = os.listdir(PATH)
        for file in cache_files:
            file_path = f"{PATH}/{file}"
            os.unlink(file_path)


# leaving tasks here to allow for the option to select specific task files
def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
    cache_files = os.listdir(PATH)

    file_task_names = []
    for file in cache_files:
        file_without_prefix = file.split("-")[1]
        file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
        file_task_names.append(file_without_prefix_and_suffix)

    return cache_files, file_task_names


def assert_created(tasks: List[str], file_task_names: List[str]):
    tasks.sort()
    file_task_names.sort()

    assert tasks == file_task_names


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_true(tasks: List[str]):
    run_model_for_task_caching(tasks=tasks, cache_requests="true")

    cache_files, file_task_names = get_cache_files()

    assert_created(tasks=tasks, file_task_names=file_task_names)


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_refresh(tasks: List[str]):
    run_model_for_task_caching(tasks=tasks, cache_requests="true")

    timestamp_before_test = datetime.now().timestamp()

    run_model_for_task_caching(tasks=tasks, cache_requests="refresh")

    cache_files, file_task_names = get_cache_files()

    for file in cache_files:
        modification_time = os.path.getmtime(f"{PATH}/{file}")
        assert modification_time > timestamp_before_test

    tasks.sort()
    file_task_names.sort()

    assert tasks == file_task_names


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_delete(tasks: List[str]):
    # populate the data first, rerun this test within this test for additional confidence
    test_requests_caching_true(tasks=tasks)

    run_model_for_task_caching(tasks=tasks, cache_requests="delete")

    cache_files, file_task_names = get_cache_files()

    assert len(cache_files) == 0


# useful for locally running tests through the debugger
if __name__ == "__main__":

    def run_tests():
        tests = [
            test_requests_caching_true,
            test_requests_caching_refresh,
            test_requests_caching_delete,
        ]

        for test_func in tests:
            clear_cache()
            test_func(tasks=DEFAULT_TASKS)

        print("Tests pass")

    run_tests()
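A small aside (my reading, not something the commit documents): get_cache_files above assumes cache file names of the form <prefix>-<task_name>.<extension>, since it keeps whatever sits between the first "-" and the first ".". A standalone illustration with a hypothetical file name:

# Hypothetical cache file name, used purely to illustrate the parsing in get_cache_files.
example_cache_file = "requests-hellaswag.pickle"  # assumed naming pattern

file_without_prefix = example_cache_file.split("-")[1]              # "hellaswag.pickle"
file_without_prefix_and_suffix = file_without_prefix.split(".")[0]  # "hellaswag"

assert file_without_prefix_and_suffix == "hellaswag"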
tests/test_utils.py

@@ -2,6 +2,7 @@ import itertools
import numpy as np
import pytest
import torch

from lm_eval.api.metrics import (
    aggregate_subtask_metrics,

@@ -258,12 +259,20 @@ class TestCollator:
        ]
        return samples

    def make_loglikelihood_sample_group(self, end=11):
        a = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(9)]
        b = [
            (("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x, y, z])
            for x, y, z in zip(range(9), range(9, 18), range(18, 27))
        ]
        return a + b

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
    def test_generations(self, batch_size, end):
        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
        generation_samples = self.make_generate_sample(int(end))
        gens = Collator(generation_samples, _collate_gen, grouping=True)
        gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for chunks in chunks:

@@ -292,7 +301,10 @@ class TestCollator:
    def test_loglikelihood(self, batch_size, end):
        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
        loglikelihoods = Collator(
            loglikelihood_samples,
            _collate_log,
        )
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for chunks in chunks:

@@ -309,6 +321,48 @@ class TestCollator:
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]

    @pytest.mark.parametrize("batch_size", [17, 8, 12, 0])
    def test_context_grouping(self, batch_size):
        def _collate(x):
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

        _collate_log = _collate  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample_group()
        loglikelihoods = Collator(
            loglikelihood_samples,
            _collate_log,
            group_fn=lambda a: a[-2] + a[-1][:-1],
            group_by="contexts",
        )
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        outputs_ = []
        for chunks in chunks:
            # check batching
            if batch_size != 0:
                assert len(chunks) <= batch_size
            # check reorder
            assert all(
                len(chunks[i][1]) <= len(chunks[i - 1][1])
                for i in range(1, len(chunks))
            )
            for x in chunks:
                for request_str, cont_toks, logits in loglikelihoods.get_cache(
                    req_str="".join(x[0]),
                    cxt_toks=x[1],
                    cont_toks=x[2],
                    logits=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
                    .unsqueeze(0)
                    .unsqueeze(0),
                ):
                    output.append(x[1])
                    outputs_.append(cont_toks)
        assert len(output) == len(outputs_)
        # check indices
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]


def test_aggregate_mean():
    # test weight_by_size is respected
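One note on the new test_context_grouping test: the group_fn it passes to Collator builds a key from the context tokens plus all but the last continuation token, which is why, per my reading, samples sharing a context prefix land in the same "contexts" group. A pure-Python sketch (no lm_eval import, my illustration only) of that key:

# Same sample shape and lambda as in the test above.
samples = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(3)]
group_fn = lambda a: a[-2] + a[-1][:-1]  # noqa: E731

keys = [tuple(group_fn(sample)) for sample in samples]
print(keys)  # three identical (1, 2, 3, 4, 5, 6, 7, 8) keys -> one shared group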