gaoqiong / lm-evaluation-harness · Commit f71d56eb

Commit f71d56eb, authored Aug 21, 2023 by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into superglue

Parents: 33f2f9bf, 2f870265
Changes: 163 (this page shows 3 changed files, with 159 additions and 22 deletions):

* setup.py (+24, -18)
* templates/new_yaml_task/README.md (+10, -4)
* tests/models/test_huggingface.py (+125, -0)
setup.py (+24, -18)

```diff
 import setuptools
+import itertools
 
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
+extras_require = {
+    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
+    "linting": ["flake8", "pylint", "mypy", "pre-commit"],
+    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
+    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
+    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
+    "promptsource": [
+        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
+    ],
+    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+    "anthropic": ["anthropic"],
+    "openai": ["openai", "tiktoken"],
+}
+extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
+
 setuptools.setup(
     name="lm_eval",
     version="1.0.0",
...
@@ -50,22 +73,5 @@ setuptools.setup(
         "transformers>=4.1",
         "zstandard",
     ],
-    extras_require={
-        "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-        "linting": ["flake8", "pylint", "mypy", "pre-commit"],
-        "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-        "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-        "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
-        "promptsource": [
-            "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-        ],
-        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-        "anthropic": ["anthropic"],
-        "openai": ["openai", "tiktoken"],
-    },
+    extras_require=extras_require,
 )
```
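The net effect of the setup.py change is that the optional dependency groups are defined once at module level and an aggregate "all" extra is derived from them, instead of repeating the mapping inline in `setuptools.setup()`. A minimal sketch of the aggregation pattern follows; the two-group mapping is a trimmed-down stand-in for illustration only, not the full set of extras defined in the diff above.

```python
import itertools

# Trimmed-down, illustrative extras mapping; the real setup.py defines
# the full set of groups shown in the diff above.
extras_require = {
    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
    "openai": ["openai", "tiktoken"],
}

# Flatten every per-group dependency list into a single "all" extra.
extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))

print(extras_require["all"])
# ['pytest', 'pytest-cov', 'pytest-xdist', 'openai', 'tiktoken']
```

With setuptools extras, any group can then be installed by name, for example `pip install -e ".[testing]"`, or `pip install -e ".[all]"` to pull in every optional dependency at once.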
templates/new_yaml_task/README.md (+10, -4)

````diff
@@ -2,7 +2,8 @@
 ### Paper
-Title: `paper title goes here`
+
+Title: `paper titles goes here`
 
 Abstract: `link to paper PDF or arXiv abstract goes here`
 
 `Short description of paper / benchmark goes here:`
...
@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
 BibTeX-formatted citation goes here
 ```
 
-### Subtasks
+### Groups and Tasks
+
+#### Groups
+* `group_name`: `Short description`
+
+#### Tasks
 
 List or describe tasks defined in this folder, and their names here:
 * `task_name`: `1-sentence description of what this particular task does`
 * `task_name2`: ...
 
 ### Checklist
...
````
tests/models/test_huggingface.py (new file, mode 100644) (+125, -0)

```python
from __future__ import annotations

import pytest
import numpy as np

from lm_eval.models.huggingface import HFLM
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks


class Test_HFLM:
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
    greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10
    GREEDY_UNTIL: list[Instance] = greedy_until_task.instances

    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]
    GREEDY_UNTIL_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]

    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
        # change atol in case of consistent failure
        assert np.allclose(_res, _RES, atol=1e-4)
        # check indices for Multiple Choice
        argmax_RES, argmax_res = np.argmax(
            np.array(_RES).reshape(-1, 4), axis=1
        ), np.argmax(np.array(_res).reshape(-1, 4), axis=1)
        assert (argmax_RES == argmax_res).all()

    def test_greedy_until(self) -> None:
        res = self.LM.greedy_until(self.GREEDY_UNTIL)
        assert res == self.GREEDY_UNTIL_RES

    def test_logliklihood_rolling(self) -> None:
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(res, self.ROLLING_RES, atol=1e-2)

    def test_toc_encode(self) -> None:
        res = self.LM.tok_encode("foo bar")
        assert res == [12110, 2534]

    def test_toc_decode(self) -> None:
        res = self.LM.tok_decode([12110, 2534])
        assert res == "foo bar"

    def test_batch_encode(self) -> None:
        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
        assert res == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        context = self.LM.tok_batch_encode(["foo bar"])[0]
        res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
        res = self.LM.tok_decode(res[0])
        assert res == "foo bar\n<bazhang>!info bar"
```
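The multiple-choice assertion in `test_logliklihood` works by grouping the 40 per-choice log-likelihoods into 10 documents of 4 answer choices each and checking that the model picks the same answer index as the reference scores, even if the raw values drift slightly. Below is a standalone sketch of that reshape-and-argmax pattern, using made-up scores for two documents rather than the reference numbers in the test file.

```python
import numpy as np

# Made-up log-likelihood scores for 2 documents x 4 answer choices (8 values),
# standing in for the per-choice results returned by HFLM.loglikelihood.
reference = np.array([-3.0, -1.5, -4.2, -2.8, -0.9, -2.1, -1.1, -3.3])
observed = np.array([-3.1, -1.4, -4.0, -2.9, -1.0, -2.0, -1.2, -3.4])

# Group scores by document: one row per document, one column per choice,
# then take the index of the highest-scoring choice in each row.
ref_choices = np.argmax(reference.reshape(-1, 4), axis=1)
obs_choices = np.argmax(observed.reshape(-1, 4), axis=1)

# The predicted answer index must match for every document, even though the
# raw scores differ slightly (as they might across hardware or library versions).
assert (ref_choices == obs_choices).all()
print(ref_choices)  # [1 0]
```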