Call "exact_match" once for each multiple-target sample (#1266)

* Refine scoring logic for multiple_target "exact_match" metric
* skip old tests from master
* skip old tests from master
* delete tests from master

Call "exact_match" once for each multiple-target sample (#1266)
* Refine scoring logic for multiple_target "exact_match" metric
* skip old tests from master
* skip old tests from master
* delete tests from master
692e0f83 · Baber Abbasi · GitHub · 9b0b15b1 · 692e0f83 · 692e0f83
Unverified Commit 692e0f83 authored Jan 10, 2024 by Baber Abbasi Committed by GitHub Jan 10, 2024
6 changed files
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -61,7 +61,7 @@ jobs:
 #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
 #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Test with pytest
-      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
+      run: python -m pytest --showlocals -s -vv -n=auto
    - name: Archive artifacts
      uses: actions/upload-artifact@v3
      with:

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -1131,27 +1131,36 @@ class ConfigurableTask(Task):
                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
                        # print(gold)
                        gold = [gold]
-                    for gold_option in gold:
-                        try:
-                            result_score = self._metric_fn_list[metric](
-                                references=[gold_option],
-                                predictions=[result],
-                                **self._metric_fn_kwargs[metric],
-                            )
-                        except (
-                            TypeError
-                        ):  # TODO: this is hacky and I don't want to do it
-                            result_score = self._metric_fn_list[metric](
-                                [gold_option, result]
-                            )
-                        if isinstance(result_score, dict):
-                            # TODO: this handles the case where HF evaluate returns a dict.
-                            result_score = result_score[metric]
-                        scores.append(result_score)
-                    if any(scores):
-                        result_score = 1.0
+                    if metric == "exact_match":
+                        result = [result for _ in range(len(gold))]
+                        scores = self._metric_fn_list[metric](
+                            references=gold,
+                            predictions=result,
+                            **self._metric_fn_kwargs[metric],
+                        )[metric]
+                        result_score = 1.0 if scores > 0.0 else 0.0
                     else:
-                        result_score = 0.0
+                        for gold_option in gold:
+                            try:
+                                result_score = self._metric_fn_list[metric](
+                                    references=[gold_option],
+                                    predictions=[result],
+                                    **self._metric_fn_kwargs[metric],
+                                )
+                            except (
+                                TypeError
+                            ):  # TODO: this is hacky and I don't want to do it
+                                result_score = self._metric_fn_list[metric](
+                                    [gold_option, result]
+                                )
+                            if isinstance(result_score, dict):
+                                # TODO: this handles the case where HF evaluate returns a dict.
+                                result_score = result_score[metric]
+                            scores.append(result_score)
+                        if any(scores):
+                            result_score = 1.0
+                        else:
+                            result_score = 0.0
                else:
                    try:
                        result_score = self._metric_fn_list[metric](

--- a/tests/tests_master/test_description.py
+++ b/tests/tests_master/test_description.py
-import random
-import lm_eval.models
-import lm_eval.tasks
-def test_description():
-    seed = 42
-    num_examples = 1
-    task_names = ["arc_challenge", "arc_easy"]
-    description_dict = {
-        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
-        "lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
-    }
-    task_dict = lm_eval.tasks.get_task_dict(task_names)
-    for task_name, task in task_dict.items():
-        # patch description field in task (# TODO: make this much more cleaned up)
-        task._config.description = description_dict[task_name]
-        rnd = random.Random()
-        rnd.seed(seed)
-        if task.has_training_docs():
-            docs = task.training_docs()
-        elif set == "val" and task.has_validation_docs():
-            docs = task.validation_docs()
-        elif set == "test" and task.has_test_docs():
-            docs = task.test_docs()
-        description = (
-            description_dict[task_name]
-            if description_dict and task_name in description_dict
-            else ""
-        )
-        for _, doc in (
-            zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
-        ):
-            ctx = task.fewshot_context(
-                doc=doc,
-                num_fewshot=1,
-            )
-            assert description in ctx
--- a/tests/tests_master/test_generate_13_grams.py
+++ b/tests/tests_master/test_generate_13_grams.py
-import glob
-import logging
-import os
-import shutil
-from collections import Counter
-from lm_eval.decontamination.archiver import Archive, TextReader
-from lm_eval.decontamination.janitor import Janitor, word_ngrams
-from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
-logger = logging.getLogger(__name__)
-def test_generate_13_grams_1(caplog):
-    data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae.
-    This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese).
-    Some other birds, mostly related to the shelducks, have "goose" as part of their names.
-    More distantly related members of the family Anatidae are swans, most of which are larger
-    than true geese, and ducks, which are smaller. The term "goose" may refer to either a male
-    or female bird, but when paired with "gander", refers specifically to a female one (the latter referring
-    to a male). Young birds before fledging are called goslings. The collective noun for a group of
-    geese on the ground is a gaggle; when in flight, they are called a skein, a team, or a wedge; when
-    flying close together, they are called a plump."""
-    data = data + data
-    # Simple Generation
-    print("simple generation")
-    n = 13
-    janitor = Janitor()
-    ngrams = word_ngrams(janitor.normalize_string(data), n)
-    comparison = list(ngrams)
-    comparison_counter = Counter(comparison)
-    print(len(comparison))
-    # print(comparison)
-    # Generating into buckets
-    print("bucket generation")
-    test_working_directory = "test_generate_13_grams"
-    try:
-        shutil.rmtree(test_working_directory)
-    except FileNotFoundError:
-        pass
-    os.makedirs(test_working_directory)
-    assert not os.path.exists("../pile")
-    os.makedirs("../pile")
-    archive = Archive(os.path.join("../pile", "test.jsonl.zst"))
-    archive.add_data(data)
-    archive.commit()
-    bucket_count = 4
-    do_ngrams_in_buckets(n, test_working_directory, bucket_count)
-    # Rebuild from buckets
-    print("rebuild")
-    rebuilt_ngrams = []
-    bucket_file_paths = glob.glob(
-        os.path.join(test_working_directory, "output", "*.bkt.txt")
-    )
-    for bucket_file_path in bucket_file_paths:
-        reader = TextReader(bucket_file_path)
-        for line in reader.read():
-            [ngram, document_id] = line.rsplit(" ", 1)
-            rebuilt_ngrams.append(ngram)
-    # Compare
-    print("compare")
-    result_counter = Counter(rebuilt_ngrams)
-    # print(len(result_counter))
-    # print(len(comparison_counter))
-    assert len(result_counter) == len(comparison_counter)
-    # print(result_counter)
-    # print(comparison_counter)
-    assert comparison_counter == result_counter
--- a/tests/tests_master/test_models.py
+++ b/tests/tests_master/test_models.py
-import hashlib
-import json
-import os
-import pickle
-import unittest.mock as mock
-import pytest
-from openai import OpenAI
-import lm_eval.models as models
-client = OpenAI()
-LOGLIKELIHOOD_TEST_CASES = [
-    ("The quick brown fox jumps over the lazy", " dog"),
-    ("The quick brown fox jumps over the lazy", " cat"),
-    ("The quick brown fox jumps over the lazy", ", lazy dog"),
-    ("The quick brown fox jumps over the lazy", ", lazy fox"),
-    (
-        "The quick brown fox jumps over the lazy",
-        ", lazy fox and they both fall to the ground",
-    ),
-    (
-        """A mult""",
-        """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
-    ),
-    (
-        """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
-        """ (with threshold activation); see § Terminology""",
-    ),
-    (
-        """Multilayer perceptrons are sometimes coll""",
-        """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
-    ),
-    (
-        """An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
-        """ activation function.""",
-    ),
-    (
-        """MLP utilizes a supervised""",
-        """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
-    ),
-    (
-        """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
-        """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
-    ),
-    (
-        """Specifically, we train GPT-3, an autoregressive language model with 175""",
-        """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
-    ),
-    (
-        """A mult""",
-        """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
-    ),
-    ("""Hello""", """ World"""),
-]
-# Test HuggingFace Models (GPT-2)
-def test_gpt2():
-    gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
-    (
-        (ll_dog, ig_dog),
-        (ll_cat, ig_cat),
-        (_, ll_max_0),
-        (_, ll_max_1),
-        (_, ll_max_2),
-        *vals,
-    ) = gpt2.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
-    assert ll_dog > ll_cat
-    assert not ig_cat
-    assert not ll_max_0
-    assert ll_max_1
-    assert ll_max_2
-    # test empty context
-    gpt2.loglikelihood([("", "test")])
-    (gen,) = gpt2.generate_until(
-        [("The quick brown fox jumps over the lazy", [".", "\n"])]
-    )
-    assert gen == ", lazy fox and they both fall to the ground"
-    targets = [
-        -61.60536193847656,
-        -56.57843780517578,
-        -62.131004333496094,
-        -9.799489974975586,
-        -153.96334838867188,
-        -341.222900390625,
-        -731.1475830078125,
-        -61.60536193847656,
-        -8.682319641113281,
-    ]
-    for (pred, _), tgt in zip(vals, targets):
-        assert pred == pytest.approx(tgt, rel=1e-3)
-def test_gpt2_perplexity():
-    gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
-    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
-    perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum(
-        [
-            -4.9599953,
-            -8.069298,
-            -8.308624,
-            -10.178513,
-            -8.906924,
-            -1.9318912,
-            -7.745445,
-            -7.146077,
-            -5.2072,
-            -3.5882986,
-            -1.9957212,
-            -8.044922,
-            -0.20841774,
-            -5.1096807,
-            -0.099879116,
-            -8.888423,
-            -4.6180487,
-        ]
-    )
-    assert perplexity == pytest.approx(tgt, rel=1e-3)
-    with mock.patch.object(
-        models.gpt2.HFLM, "max_length", new_callable=mock.PropertyMock
-    ) as mock_max_length:
-        mock_max_length.return_value = 5
-        gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
-        perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum(
-        [
-            -4.96001,
-            -8.069275,
-            -8.308612,
-            -10.178482,
-            -8.90691,
-            -4.037338,
-            -8.09261,
-            -11.662385,
-            -10.206891,
-            -4.425003,
-            -2.2563353,
-            -7.909143,
-            -1.9304147,
-            -7.3610134,
-            -2.3120654,
-            -7.3229,
-            -2.1643813,
-        ]
-    )
-    assert perplexity == pytest.approx(tgt, rel=1e-3)
-# Test OpenAI Models (GPT-3)
-def openai_mock_completion(**kwargs):
-    # Mock completion function
-    # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
-    os.makedirs("tests/testdata", exist_ok=True)
-    hash = hashlib.sha256(
-        json.dumps(kwargs, sort_keys=True).encode("utf-8")
-    ).hexdigest()
-    fname = f"tests/testdata/gpt3_test_{hash}.pkl"
-    if os.path.exists(fname):
-        with open(fname, "rb") as fh:
-            return pickle.load(fh)
-    ret = client.completions.create(**kwargs)
-    ret.api_key = ""
-    with open(fname, "wb") as fh:
-        pickle.dump(ret, fh)
-    return ret
-@mock.patch("lm_eval.models.gpt3.oa_completion", new=openai_mock_completion)
-def test_gpt3():
-    if "OPENAI_API_SECRET_KEY" not in os.environ:
-        os.environ["OPENAI_API_SECRET_KEY"] = ""
-    gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
-    (
-        (ll_dog, ig_dog),
-        (ll_cat, ig_cat),
-        (_, ll_max_0),
-        (_, ll_max_1),
-        (_, ll_max_2),
-        *vals,
-    ) = gpt3.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
-    assert ll_dog > ll_cat
-    assert not ig_cat
-    assert ig_dog
-    assert not ll_max_0
-    assert not ll_max_1
-    assert not ll_max_2
-    # test empty context
-    gpt3.loglikelihood([("", "test")])
-    (gen,) = gpt3.generate_until(
-        [("The quick brown fox jumps over the lazy", [".", "\n"])]
-    )
-    assert gen == " dog"
-    print([x[0] for x in vals])
-    targets = [
-        -34.848301606999996,
-        -47.148329679999996,
-        -45.44380149599999,
-        -5.285246016,
-        -133.97821690686004,
-        -321.2616693239001,
-        -658.0299524401041,
-        -34.848301606999996,
-        -7.525115,
-    ]
-    for (pred, _), tgt in zip(vals, targets):
-        assert pred == pytest.approx(tgt, rel=1e-3)
-@mock.patch("lm_eval.models.gpt3.oa_completion", new=openai_mock_completion)
-def test_gpt3_perplexity():
-    if "OPENAI_API_SECRET_KEY" not in os.environ:
-        os.environ["OPENAI_API_SECRET_KEY"] = ""
-    gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
-    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
-    perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
-    tgt = -84.38819608
-    assert perplexity == pytest.approx(tgt, rel=1e-3)
-    # Hack: modify gpt3 to have shorter context length to induce rolling windows
-    with mock.patch.object(
-        models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
-    ) as mock_max_length:
-        mock_max_length.return_value = 5
-        gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
-        perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
-    tgt = -101.81967209999999
-    assert perplexity == pytest.approx(tgt, rel=1e-3)
-# Test TextSynth Models (GPT-J)
-def textsynth_mock_completion(**kwargs):
-    # Mock completion function
-    # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
-    import requests
-    os.makedirs("tests/testdata", exist_ok=True)
-    hash_kwargs = {k: v for k, v in kwargs.items() if k != "headers"}
-    hash = hashlib.sha256(
-        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
-    ).hexdigest()
-    fname = f"tests/testdata/textsynth_test_{hash}.pkl"
-    if os.path.exists(fname):
-        with open(fname, "rb") as fh:
-            return pickle.load(fh)
-    ret = requests.post(**kwargs)
-    with open(fname, "wb") as fh:
-        pickle.dump(ret, fh)
-    return ret
-@mock.patch(
-    "lm_eval.models.textsynth.textsynth_completion", new=textsynth_mock_completion
-)
-def test_textsynth():
-    if "TEXTSYNTH_API_SECRET_KEY" not in os.environ:
-        os.environ["TEXTSYNTH_API_SECRET_KEY"] = ""
-    textsynth = models.get_model("textsynth").create_from_arg_string("engine=gptj_6B")
-    (
-        (ll_dog, ig_dog),
-        (ll_cat, ig_cat),
-        (_, ll_max_0),
-        (_, ll_max_1),
-        (_, ll_max_2),
-        *vals,
-    ) = textsynth.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
-    assert ll_dog > ll_cat
-    assert not ig_cat
-    assert ig_dog
-    assert not ll_max_0
-    assert not ll_max_1
-    assert not ll_max_2
-    # test empty context
-    textsynth.loglikelihood([("", "test")])
-    (gen,) = textsynth.generate_until(
-        [("The quick brown fox jumps over the lazy", [".", "\n"])]
-    )
-    assert gen == " dog"
-    print([x[0] for x in vals])
-    targets = [
-        -17.90513712817,
-        -41.83518912287,
-        -33.82445643841,
-        -2.377361565302,
-        -99.53018069754,
-        -243.5642283598,
-        -528.6862613790,
-        -17.90513712817,
-        -5.041000672142,
-    ]
-    for (pred, _), tgt in zip(vals, targets):
-        assert pred == pytest.approx(tgt, rel=1e-3)
--- a/tests/tests_master/test_version_stable.py
+++ b/tests/tests_master/test_version_stable.py
-import collections
-import hashlib
-import json
-import os
-import random
-import pytest
-import lm_eval.evaluator as evaluator
-import lm_eval.models as models
-import lm_eval.tasks as tasks
-os.makedirs("tests/testdata", exist_ok=True)
-def assert_target(name, ob):
-    fname = f"tests/testdata/{name}.json"
-    if os.path.exists(fname):
-        with open(fname) as fh:
-            # Use relative tolerance of 1e-5 and absolute tolerance of 1e-8
-            # assuming most metrics work on `float32` values, which is the common
-            # default floating type across popular libraries (PyTorch, Tensorflow, and JAX).
-            assert flatten(json.load(fh)) == pytest.approx(
-                flatten(json.loads(json.dumps(ob, sort_keys=True))), rel=1e-5, abs=1e-8
-            )
-    else:
-        with open(fname, "w") as fh:
-            json.dump(ob, fh, sort_keys=True)
-def assert_target_hashed(name, ob):
-    fname = f"tests/testdata/{name}"
-    if os.path.exists(fname):
-        with open(fname) as fh:
-            assert (
-                fh.read()
-                == hashlib.sha256(
-                    json.dumps([o.__dict__ for o in ob], sort_keys=True).encode("utf-8")
-                ).hexdigest()
-            )
-    else:
-        with open(fname, "w") as fh:
-            fh.write(
-                hashlib.sha256(
-                    json.dumps([o.__dict__ for o in ob], sort_keys=True).encode("utf-8")
-                ).hexdigest()
-            )
-# from https://stackoverflow.com/a/6027615
-def flatten(d, parent_key="", sep="."):
-    items = []
-    for k, v in d.items():
-        new_key = parent_key + sep + k if parent_key else k
-        if isinstance(v, collections.abc.MutableMapping):
-            items.extend(flatten(v, new_key, sep=sep).items())
-        else:
-            items.append((new_key, v))
-    return dict(items)
-# make sure eval results for a task version are stable
-@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
-def test_versions_stable(taskname, task_class):
-    task_dict = tasks.get_task_dict([taskname])
-    lm = models.get_model("dummy")()
-    def ll_fn(reqs):
-        for ctx, cont in [req.args for req in reqs]:
-            if len(ctx) == 0:
-                continue
-            # space convention
-            assert ctx[-1] != " "
-            assert cont[0] == " " or ctx[-1] == "\n"
-        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
-        res = []
-        random.seed(42)
-        for _ in reqs:
-            res.append((-random.random(), False))
-        return res
-    def ll_perp_fn(reqs):
-        for (string,) in [req.args for req in reqs]:
-            assert isinstance(string, str)
-        assert_target_hashed(
-            f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs
-        )
-        res = []
-        random.seed(42)
-        for _ in reqs:
-            res.append(-random.random())
-        return res
-    def generate_until(reqs):
-        res = []
-        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-generate_until", reqs)
-        for ctx, _ in [req.args for req in reqs]:
-            res.append("lol")
-            assert ctx.strip() != ""
-        return res
-    lm.loglikelihood = ll_fn
-    lm.loglikelihood_rolling = ll_perp_fn
-    lm.generate_until = generate_until
-    limit = None
-    result = evaluator.evaluate(
-        lm=lm,
-        task_dict=task_dict,
-        num_fewshot=0,
-        limit=limit,
-        bootstrap_iters=10,
-    )
-    assert_target(f"{taskname}-v{task_class.VERSION}-res", result)