Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
0cdd730e
Unverified
Commit
0cdd730e
authored
Aug 19, 2023
by
Hailey Schoelkopf
Committed by
GitHub
Aug 19, 2023
Browse files
Merge pull request #791 from baberabb/big-refactor_hgtest
[Refactor] Added HF model test
parents
1c5a73c9
51882c1e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
133 additions
and
16 deletions
+133
-16
.github/workflows/new_tasks.yml
.github/workflows/new_tasks.yml
+1
-0
.github/workflows/pull_request.yml
.github/workflows/pull_request.yml
+0
-13
.github/workflows/unit_tests.yml
.github/workflows/unit_tests.yml
+7
-3
tests/models/test_huggingface.py
tests/models/test_huggingface.py
+125
-0
No files found.
.github/workflows/new_tasks.yml
View file @
0cdd730e
...
@@ -50,6 +50,7 @@ jobs:
...
@@ -50,6 +50,7 @@ jobs:
uses
:
actions/setup-python@v4
uses
:
actions/setup-python@v4
with
:
with
:
python-version
:
3.9
python-version
:
3.9
cache
:
'
pip'
-
name
:
Install dependencies
-
name
:
Install dependencies
if
:
steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
if
:
steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run
:
|
run
:
|
...
...
.github/workflows/pull_request.yml
deleted
100644 → 0
View file @
1c5a73c9
# Workflow with a single job, `pre-commit`, triggered by pull requests.
name: Pull Request

# Run on every pull_request event.
on: [pull_request]

jobs:
  pre-commit:
    runs-on: ubuntu-20.04
    steps:
      # Check out the repository contents.
      - uses: actions/checkout@v3
      # Set up a Python 3.9 interpreter for the following step.
      - uses: actions/setup-python@v4
        with:
          python-version: 3.9
      # Invoke the pre-commit action (pinned to v2.0.3).
      - uses: pre-commit/action@v2.0.3
.github/workflows/unit_tests.yml
View file @
0cdd730e
...
@@ -6,10 +6,10 @@ name: Unit Tests
...
@@ -6,10 +6,10 @@ name: Unit Tests
on
:
on
:
push
:
push
:
branches
:
branches
:
-
big-refactor
-
'
big-refactor
*'
pull_request
:
pull_request
:
branches
:
branches
:
-
big-refactor
-
'
big-refactor
*'
workflow_dispatch
:
workflow_dispatch
:
# Jobs run concurrently and steps run sequentially within a job.
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
...
@@ -26,8 +26,11 @@ jobs:
...
@@ -26,8 +26,11 @@ jobs:
uses
:
actions/setup-python@v4
uses
:
actions/setup-python@v4
with
:
with
:
python-version
:
3.9
python-version
:
3.9
cache
:
'
pip'
-
name
:
Install dependencies
-
name
:
Install dependencies
run
:
pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
run
:
pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-
name
:
Pre-Commit
uses
:
pre-commit/action@v3.0.0
-
name
:
Lint with pylint
-
name
:
Lint with pylint
run
:
python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
run
:
python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
-
name
:
Lint with flake8
-
name
:
Lint with flake8
...
@@ -52,6 +55,7 @@ jobs:
...
@@ -52,6 +55,7 @@ jobs:
uses
:
actions/setup-python@v4
uses
:
actions/setup-python@v4
with
:
with
:
python-version
:
3.9
python-version
:
3.9
cache
:
'
pip'
-
name
:
Install dependencies
-
name
:
Install dependencies
run
:
|
run
:
|
python -m pip install --upgrade pip
python -m pip install --upgrade pip
...
@@ -60,4 +64,4 @@ jobs:
...
@@ -60,4 +64,4 @@ jobs:
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-
name
:
Test with pytest
-
name
:
Test with pytest
run
:
python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
run
:
python -m pytest
--showlocals
-s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
tests/models/test_huggingface.py
0 → 100644
View file @
0cdd730e
from
__future__
import
annotations
import
pytest
import
numpy
as
np
from
lm_eval.models.huggingface
import
HFLM
from
lm_eval.api.instance
import
Instance
import
lm_eval.tasks
as
tasks
class Test_HFLM:
    """Regression tests for ``HFLM`` using ``EleutherAI/pythia-70m`` on CPU.

    Everything in the class body runs once, when the class is defined:
    three tasks are instantiated from ``tasks.TASK_REGISTRY``, their first
    ten requests are built, and the model is loaded. Each test then compares
    the model's output against the hard-coded ``*_RES`` reference values
    (presumably recorded from an earlier run of the same model -- confirm
    before regenerating them).
    """

    # --- Multiple-choice requests (arc_easy) -------------------------------
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    # --- Generation requests (gsm8k_yaml) ----------------------------------
    greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
    greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    # NOTE(review): the cap is applied after the requests were built; it only
    # reaches the instances if they share this kwargs dict -- confirm.
    greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10
    GREEDY_UNTIL: list[Instance] = greedy_until_task.instances

    # --- Rolling log-likelihood requests (wikitext) ------------------------
    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    # Reference log-likelihoods for MULTIPLE_CH (40 values; the test reshapes
    # them into rows of 4).
    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]

    # Reference generations for GREEDY_UNTIL.
    GREEDY_UNTIL_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]

    # Reference rolling log-likelihoods for ROLLING.
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]

    # Model under test, shared by every test method.
    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        """Log-likelihoods and the per-row argmax must match the references."""
        scored = self.LM.loglikelihood(self.MULTIPLE_CH)
        expected = self.MULTIPLE_CH_RES
        # Only the first element of each result is compared.
        observed = [entry[0] for entry in scored]

        # np.allclose runs with its default tolerances here; pass a looser
        # atol if this fails consistently.
        assert np.allclose(observed, expected)

        # Group the scores into rows of 4 and compare which column wins in
        # each row (assumes four answer choices per document -- TODO confirm).
        expected_choice = np.array(expected).reshape(-1, 4).argmax(axis=1)
        observed_choice = np.array(observed).reshape(-1, 4).argmax(axis=1)
        assert (expected_choice == observed_choice).all()

    def test_greedy_until(self) -> None:
        """Generated continuations must equal the reference strings exactly."""
        generated = self.LM.greedy_until(self.GREEDY_UNTIL)
        assert generated == self.GREEDY_UNTIL_RES

    def test_logliklihood_rolling(self) -> None:
        """Rolling log-likelihoods must match the references within 1e-2."""
        rolling_scores = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(rolling_scores, self.ROLLING_RES, atol=1e-2)

    def test_toc_encode(self) -> None:
        """``tok_encode`` must map "foo bar" to the expected token ids."""
        token_ids = self.LM.tok_encode("foo bar")
        assert token_ids == [12110, 2534]

    def test_toc_decode(self) -> None:
        """``tok_decode`` must map the token ids back to "foo bar"."""
        text = self.LM.tok_decode([12110, 2534])
        assert text == "foo bar"

    def test_batch_encode(self) -> None:
        """Batch encoding must yield one id row per input string."""
        batch = self.LM.tok_batch_encode(["foo bar", "bar foo"])
        # Element 0 of the returned value holds the ids; convert to plain
        # lists so they can be compared with ``==``.
        id_rows = batch[0].tolist()
        assert id_rows == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        """``_model_generate`` must reproduce the reference continuation."""
        prompt_ids = self.LM.tok_batch_encode(["foo bar"])[0]
        generated_ids = self.LM._model_generate(
            prompt_ids, max_length=10, stop=["\n\n"]
        )
        decoded = self.LM.tok_decode(generated_ids[0])
        assert decoded == "foo bar\n<bazhang>!info bar"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment