Unverified Commit 0cdd730e authored by Hailey Schoelkopf's avatar Hailey Schoelkopf Committed by GitHub
Browse files

Merge pull request #791 from baberabb/big-refactor_hgtest

[Refactor] Added HF model test
parents 1c5a73c9 51882c1e
......@@ -50,6 +50,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
......
# Runs pre-commit hooks on every pull request.
name: Pull Request

on: [pull_request]

jobs:
  pre-commit:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          # Quoted to keep the version a string; unquoted values like 3.10
          # are parsed as YAML floats (3.10 -> 3.1).
          python-version: '3.9'
      - uses: pre-commit/action@v2.0.3
......@@ -6,10 +6,10 @@ name: Unit Tests
on:
push:
branches:
- big-refactor
- 'big-refactor*'
pull_request:
branches:
- big-refactor
- 'big-refactor*'
workflow_dispatch:
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
......@@ -26,8 +26,11 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Pre-Commit
uses: pre-commit/action@v3.0.0
- name: Lint with pylint
run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
- name: Lint with flake8
......@@ -52,6 +55,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
......@@ -60,4 +64,4 @@ jobs:
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
run: python -m pytest --showlocals -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
from __future__ import annotations
import pytest
import numpy as np
from lm_eval.models.huggingface import HFLM
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
class Test_HFLM:
    """Regression tests for the HuggingFace `HFLM` wrapper.

    Runs EleutherAI/pythia-70m on CPU in float32 and pins down
    loglikelihood, greedy generation, rolling loglikelihood, and
    tokenizer round-trip behavior against precomputed reference values.
    """

    # Build a fixed set of requests per request type (10 docs each).
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
    greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    # Cap generation length so the test stays fast.
    greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10
    GREEDY_UNTIL: list[Instance] = greedy_until_task.instances

    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    # Reference loglikelihoods for the arc_easy requests (4 choices per doc).
    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]
    # Reference greedy continuations for the gsm8k requests.
    GREEDY_UNTIL_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]
    # Reference rolling loglikelihoods for the wikitext requests.
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]

    # Shared model instance for all tests (CPU, float32 for determinism).
    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        outputs = self.LM.loglikelihood(self.MULTIPLE_CH)
        observed = [pair[0] for pair in outputs]
        expected = self.MULTIPLE_CH_RES
        # change atol in case of consistent failure
        assert np.allclose(observed, expected)
        # check indices for Multiple Choice
        expected_argmax = np.argmax(np.array(expected).reshape(-1, 4), axis=1)
        observed_argmax = np.argmax(np.array(observed).reshape(-1, 4), axis=1)
        assert (expected_argmax == observed_argmax).all()

    def test_greedy_until(self) -> None:
        generations = self.LM.greedy_until(self.GREEDY_UNTIL)
        assert generations == self.GREEDY_UNTIL_RES

    def test_logliklihood_rolling(self) -> None:
        scores = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(scores, self.ROLLING_RES, atol=1e-2)

    def test_toc_encode(self) -> None:
        token_ids = self.LM.tok_encode("foo bar")
        assert token_ids == [12110, 2534]

    def test_toc_decode(self) -> None:
        text = self.LM.tok_decode([12110, 2534])
        assert text == "foo bar"

    def test_batch_encode(self) -> None:
        batch = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
        assert batch == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        context = self.LM.tok_batch_encode(["foo bar"])[0]
        generated = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
        decoded = self.LM.tok_decode(generated[0])
        assert decoded == "foo bar\n<bazhang>!info bar"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment