Commit f71d56eb authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into superglue

parents 33f2f9bf 2f870265
import setuptools import setuptools
import itertools
with open("README.md", "r", encoding="utf-8") as fh: with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read() long_description = fh.read()
# Optional dependency groups, installable as `pip install lm_eval[<group>]`.
# Each key names an extra; each value lists the packages that extra pulls in.
extras_require = {
    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
    "linting": [
        "flake8",
        "pylint",
        "mypy",
        "pre-commit",
    ],
    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
    "promptsource": [
        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
    ],
    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
    "anthropic": ["anthropic"],
    "openai": ["openai", "tiktoken"],
}
# Convenience extra that aggregates every optional group above, preserving
# the declaration order of the groups and of the packages within them.
extras_require["all"] = [pkg for group in extras_require.values() for pkg in group]
setuptools.setup( setuptools.setup(
name="lm_eval", name="lm_eval",
version="1.0.0", version="1.0.0",
...@@ -50,22 +73,5 @@ setuptools.setup( ...@@ -50,22 +73,5 @@ setuptools.setup(
"transformers>=4.1", "transformers>=4.1",
"zstandard", "zstandard",
], ],
extras_require={ extras_require=extras_require,
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"linting": [
"flake8",
"pylint",
"mypy",
"pre-commit",
],
"testing": ["pytest", "pytest-cov", "pytest-xdist"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
"promptsource": [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"anthropic": ["anthropic"],
"openai": ["openai", "tiktoken"],
},
) )
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
### Paper ### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here` Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:` `Short description of paper / benchmark goes here:`
...@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable` ...@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here BibTeX-formatted citation goes here
``` ```
### Groups and Tasks
#### Groups
* `group_name`: `Short description`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does` * `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: ...
### Checklist ### Checklist
......
from __future__ import annotations
import pytest
import numpy as np
from lm_eval.models.huggingface import HFLM
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
class Test_HFLM:
    """Regression tests for the HuggingFace ``HFLM`` model backend.

    Class-level setup builds request ``Instance`` lists from three registered
    tasks — multiple choice (arc_easy), greedy generation (gsm8k_yaml), and
    rolling loglikelihood (wikitext) — and loads a small model on CPU.  The
    ``*_RES`` constants below are golden outputs captured from
    ``EleutherAI/pythia-70m``; each test re-runs the model and compares
    against them.  NOTE(review): all of this executes at class-definition
    (import) time, including the model download.
    """

    # Multiple-choice requests: first 10 arc_easy docs, single process
    # (rank 0 of world_size 1).  Each doc contributes 4 answer choices,
    # so MULTIPLE_CH holds 40 loglikelihood requests.
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    # Greedy-generation requests: first 10 gsm8k docs, with generation capped
    # at 10 new tokens so the test stays fast and the golden strings short.
    greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
    greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10
    GREEDY_UNTIL: list[Instance] = greedy_until_task.instances

    # Rolling-loglikelihood requests: first 10 wikitext docs.
    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    # Golden loglikelihoods for MULTIPLE_CH (40 values = 10 docs x 4 choices),
    # captured from pythia-70m in float32 on CPU.
    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]
    # Golden greedy generations (exact strings) for GREEDY_UNTIL.
    GREEDY_UNTIL_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]
    # Golden rolling loglikelihoods for ROLLING (one value per doc).
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]
    # Shared model under test; float32 on CPU for deterministic golden values.
    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        """Loglikelihoods match the golden values, and — more robustly — the
        per-doc argmax over the 4 choices agrees even if absolute values
        drift slightly.  (Method name keeps its historical misspelling.)"""
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
        # change atol in case of consistent failure
        assert np.allclose(_res, _RES, atol=1e-4)
        # check indices for Multiple Choice
        argmax_RES, argmax_res = np.argmax(
            np.array(_RES).reshape(-1, 4), axis=1
        ), np.argmax(np.array(_res).reshape(-1, 4), axis=1)
        assert (argmax_RES == argmax_res).all()

    def test_greedy_until(self) -> None:
        """Greedy generations match the golden strings exactly."""
        res = self.LM.greedy_until(self.GREEDY_UNTIL)
        assert res == self.GREEDY_UNTIL_RES

    def test_logliklihood_rolling(self) -> None:
        """Rolling loglikelihoods match golden values (looser atol: the
        sums are large, so absolute error accumulates)."""
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(res, self.ROLLING_RES, atol=1e-2)

    def test_toc_encode(self) -> None:
        """Tokenizer round-trip, encode side: "foo bar" -> known token ids."""
        res = self.LM.tok_encode("foo bar")
        assert res == [12110, 2534]

    def test_toc_decode(self) -> None:
        """Tokenizer round-trip, decode side: known token ids -> "foo bar"."""
        res = self.LM.tok_decode([12110, 2534])
        assert res == "foo bar"

    def test_batch_encode(self) -> None:
        """Batch encoding of two strings yields the expected id matrix."""
        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
        assert res == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        """Raw `_model_generate` output (context included) matches the golden
        continuation for this checkpoint."""
        context = self.LM.tok_batch_encode(["foo bar"])[0]
        res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
        res = self.LM.tok_decode(res[0])
        assert res == "foo bar\n<bazhang>!info bar"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment