Commit f71d56eb authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into superglue

parents 33f2f9bf 2f870265
import setuptools import setuptools
import itertools
with open("README.md", "r", encoding="utf-8") as fh: with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read() long_description = fh.read()
# Optional dependency groups, installable as `pip install lm_eval[<group>]`.
# Each key names an extra; each value lists the packages that extra pulls in.
extras_require = {
    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
    "linting": [
        "flake8",
        "pylint",
        "mypy",
        "pre-commit",
    ],
    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
    "promptsource": [
        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
    ],
    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
    "anthropic": ["anthropic"],
    "openai": ["openai", "tiktoken"],
}
# Convenience extra that aggregates every optional group above, preserving
# the declaration order of the groups and of the packages within them.
extras_require["all"] = [pkg for group in extras_require.values() for pkg in group]
setuptools.setup( setuptools.setup(
name="lm_eval", name="lm_eval",
version="1.0.0", version="1.0.0",
...@@ -50,22 +73,5 @@ setuptools.setup( ...@@ -50,22 +73,5 @@ setuptools.setup(
"transformers>=4.1", "transformers>=4.1",
"zstandard", "zstandard",
], ],
extras_require={ extras_require=extras_require,
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"linting": [
"flake8",
"pylint",
"mypy",
"pre-commit",
],
"testing": ["pytest", "pytest-cov", "pytest-xdist"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
"promptsource": [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"anthropic": ["anthropic"],
"openai": ["openai", "tiktoken"],
},
) )
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
### Paper ### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here` Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:` `Short description of paper / benchmark goes here:`
...@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable` ...@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here BibTeX-formatted citation goes here
``` ```
### Groups and Tasks
#### Groups
* `group_name`: `Short description`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does` * `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: ...
### Checklist ### Checklist
......
from __future__ import annotations
import pytest
import numpy as np
from lm_eval.models.huggingface import HFLM
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
class Test_HFLM:
    """Regression tests for the HuggingFace ``HFLM`` model backend.

    Class-level setup builds request ``Instance`` lists from three registered
    tasks — multiple choice (arc_easy), greedy generation (gsm8k_yaml), and
    rolling loglikelihood (wikitext) — and loads a small model on CPU.  The
    ``*_RES`` constants below are golden outputs captured from
    ``EleutherAI/pythia-70m``; each test re-runs the model and compares
    against them.  NOTE(review): all of this executes at class-definition
    (import) time, including the model download.
    """

    # Multiple-choice requests: first 10 arc_easy docs, single process
    # (rank 0 of world_size 1).  Each doc contributes 4 answer choices,
    # so MULTIPLE_CH holds 40 loglikelihood requests.
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    # Greedy-generation requests: first 10 gsm8k docs, with generation capped
    # at 10 new tokens so the test stays fast and the golden strings short.
    greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
    greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10
    GREEDY_UNTIL: list[Instance] = greedy_until_task.instances

    # Rolling-loglikelihood requests: first 10 wikitext docs.
    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    # Golden loglikelihoods for MULTIPLE_CH (40 values = 10 docs x 4 choices),
    # captured from pythia-70m in float32 on CPU.
    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]
    # Golden greedy generations (exact strings) for GREEDY_UNTIL.
    GREEDY_UNTIL_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]
    # Golden rolling loglikelihoods for ROLLING (one value per doc).
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]
    # Shared model under test; float32 on CPU for deterministic golden values.
    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        """Loglikelihoods match the golden values, and — more robustly — the
        per-doc argmax over the 4 choices agrees even if absolute values
        drift slightly.  (Method name keeps its historical misspelling.)"""
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
        # change atol in case of consistent failure
        assert np.allclose(_res, _RES, atol=1e-4)
        # check indices for Multiple Choice
        argmax_RES, argmax_res = np.argmax(
            np.array(_RES).reshape(-1, 4), axis=1
        ), np.argmax(np.array(_res).reshape(-1, 4), axis=1)
        assert (argmax_RES == argmax_res).all()

    def test_greedy_until(self) -> None:
        """Greedy generations match the golden strings exactly."""
        res = self.LM.greedy_until(self.GREEDY_UNTIL)
        assert res == self.GREEDY_UNTIL_RES

    def test_logliklihood_rolling(self) -> None:
        """Rolling loglikelihoods match golden values (looser atol: the
        sums are large, so absolute error accumulates)."""
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(res, self.ROLLING_RES, atol=1e-2)

    def test_toc_encode(self) -> None:
        """Tokenizer round-trip, encode side: "foo bar" -> known token ids."""
        res = self.LM.tok_encode("foo bar")
        assert res == [12110, 2534]

    def test_toc_decode(self) -> None:
        """Tokenizer round-trip, decode side: known token ids -> "foo bar"."""
        res = self.LM.tok_decode([12110, 2534])
        assert res == "foo bar"

    def test_batch_encode(self) -> None:
        """Batch encoding of two strings yields the expected id matrix."""
        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
        assert res == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        """Raw `_model_generate` output (context included) matches the golden
        continuation for this checkpoint."""
        context = self.LM.tok_batch_encode(["foo bar"])[0]
        res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
        res = self.LM.tok_decode(res[0])
        assert res == "foo bar\n<bazhang>!info bar"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment