Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
835cc40e
Commit
835cc40e
authored
Dec 06, 2023
by
lintangsutawika
Browse files
merged latest and added altworld files
parents
8da401e0
c9bbec6e
Changes
430
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
131 additions
and
23 deletions
+131
-23
lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml
lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml
+2
-0
lm_eval/tasks/wsc273/default.yaml
lm_eval/tasks/wsc273/default.yaml
+2
-0
lm_eval/tasks/xcopa/default_et.yaml
lm_eval/tasks/xcopa/default_et.yaml
+2
-0
lm_eval/tasks/xnli/xnli_common_yaml
lm_eval/tasks/xnli/xnli_common_yaml
+2
-0
lm_eval/tasks/xstorycloze/default_ar.yaml
lm_eval/tasks/xstorycloze/default_ar.yaml
+2
-0
lm_eval/tasks/xwinograd/xwinograd_common_yaml
lm_eval/tasks/xwinograd/xwinograd_common_yaml
+2
-0
lm_eval/utils.py
lm_eval/utils.py
+63
-19
pyproject.toml
pyproject.toml
+4
-2
tests/models/test_vllm.py
tests/models/test_vllm.py
+47
-0
tests/tests_master/test_models.py
tests/tests_master/test_models.py
+5
-2
No files found.
lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml
View file @
835cc40e
...
...
@@ -15,3 +15,5 @@ metric_list:
-
metric
:
!function
metrics.bleu
aggregation
:
!function
metrics.agg_bleu
higher_is_better
:
true
metadata
:
-
version
:
0.0
lm_eval/tasks/wsc273/default.yaml
View file @
835cc40e
...
...
@@ -13,3 +13,5 @@ metric_list:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
lm_eval/tasks/xcopa/default_et.yaml
View file @
835cc40e
...
...
@@ -10,3 +10,5 @@ doc_to_target: label
doc_to_choice
:
!function
utils.doc_to_choice
metric_list
:
-
metric
:
acc
metadata
:
-
version
:
1.0
lm_eval/tasks/xnli/xnli_common_yaml
View file @
835cc40e
...
...
@@ -15,3 +15,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
lm_eval/tasks/xstorycloze/default_ar.yaml
View file @
835cc40e
...
...
@@ -14,3 +14,5 @@ metric_list:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
lm_eval/tasks/xwinograd/xwinograd_common_yaml
View file @
835cc40e
...
...
@@ -16,3 +16,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
lm_eval/utils.py
View file @
835cc40e
...
...
@@ -10,7 +10,7 @@ import collections
import
importlib.util
import
fnmatch
from
typing
import
Iterator
,
List
,
Literal
,
Union
from
typing
import
Iterator
,
List
,
Literal
,
Union
,
Any
,
Callable
import
gc
import
torch
...
...
@@ -60,7 +60,12 @@ def handle_arg_string(arg):
return
True
elif
arg
.
lower
()
==
"false"
:
return
False
return
arg
elif
arg
.
isnumeric
():
return
int
(
arg
)
try
:
return
float
(
arg
)
except
ValueError
:
return
arg
def
simple_parse_args_string
(
args_string
):
...
...
@@ -85,6 +90,32 @@ def join_iters(iters):
def
chunks
(
iter
,
n
:
int
=
0
,
fn
=
None
):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr
=
[]
for
i
,
x
in
enumerate
(
iter
):
arr
.
append
(
x
)
...
...
@@ -201,7 +232,13 @@ def make_disjoint_window(pair):
class
Reorderer
:
def
__init__
(
self
,
arr
,
fn
)
->
None
:
def
__init__
(
self
,
arr
:
List
[
Any
],
fn
:
Callable
)
->
None
:
"""Reorder an array according to some function
Args:
arr (List[Any]): The initial array
fn (Callable[[Any], Any]): A function to determine the priority of elements
"""
self
.
size
=
len
(
arr
)
arr
=
list
(
enumerate
(
arr
))
arr
=
group
(
arr
,
lambda
x
:
fn
(
x
[
1
]))
...
...
@@ -213,9 +250,22 @@ class Reorderer:
self
.
arr
=
arr
def get_reordered(self):
    """Return the stored elements in their reordered sequence.

    Returns:
        List[Any]: Item ``[1]`` of every entry in ``self.arr``, in the
        order the entries are stored.
    """
    reordered = []
    for entry in self.arr:
        # Each entry is indexed at position 1 to pull out the element.
        reordered.append(entry[1])
    return reordered
def
get_original
(
self
,
newarr
):
"""Restores the original order of a new array based on the old array's order
Args:
newarr (List[Any]): The array to be restored
Returns:
List[Any]: The array restored to the original order
"""
res
=
[
None
]
*
self
.
size
cov
=
[
False
]
*
self
.
size
...
...
@@ -296,31 +346,27 @@ def make_table(result_dict, column: str = "results"):
elif
column
==
"groups"
:
column_name
=
"Groups"
md_writer
=
MarkdownTableWriter
()
latex_writer
=
LatexTableWriter
()
md_writer
.
headers
=
[
column_name
,
"Version"
,
"Filter"
,
"Metric"
,
"Value"
,
""
,
"Stderr"
,
]
latex_writer
.
headers
=
[
all_headers
=
[
column_name
,
"Version"
,
"Filter"
,
"n-shot"
,
"Metric"
,
"Value"
,
""
,
"Stderr"
,
]
md_writer
=
MarkdownTableWriter
()
latex_writer
=
LatexTableWriter
()
md_writer
.
headers
=
all_headers
latex_writer
.
headers
=
all_headers
values
=
[]
for
k
,
dic
in
result_dict
[
column
].
items
():
version
=
result_dict
[
"versions"
][
k
]
n
=
str
(
result_dict
[
"n-shot"
][
k
])
if
"alias"
in
dic
:
k
=
dic
.
pop
(
"alias"
)
...
...
@@ -332,9 +378,9 @@ def make_table(result_dict, column: str = "results"):
if
m
+
"_stderr"
+
","
+
f
in
dic
:
se
=
dic
[
m
+
"_stderr"
+
","
+
f
]
values
.
append
([
k
,
version
,
f
,
m
,
"%.4f"
%
v
,
"±"
,
"%.4f"
%
se
])
values
.
append
([
k
,
version
,
f
,
n
,
m
,
"%.4f"
%
v
,
"±"
,
"%.4f"
%
se
])
else
:
values
.
append
([
k
,
version
,
f
,
m
,
"%.4f"
%
v
,
""
,
""
])
values
.
append
([
k
,
version
,
f
,
n
,
m
,
"%.4f"
%
v
,
""
,
""
])
k
=
""
version
=
""
md_writer
.
value_matrix
=
values
...
...
@@ -442,7 +488,6 @@ yaml.add_constructor("!function", import_function)
def
load_yaml_config
(
yaml_path
=
None
,
yaml_config
=
None
,
yaml_dir
=
None
):
if
yaml_config
is
None
:
with
open
(
yaml_path
,
"rb"
)
as
file
:
yaml_config
=
yaml
.
full_load
(
file
)
...
...
@@ -463,7 +508,6 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
include_path
.
reverse
()
final_yaml_config
=
{}
for
path
in
include_path
:
# Assumes that path is a full path.
# If not found, assume the included yaml
# is in the same dir as the original yaml
...
...
pyproject.toml
View file @
835cc40e
...
...
@@ -70,7 +70,8 @@ promptsource = [
]
gptq
=
["auto-gptq[triton]
@
git+https://github.com/PanQiWei/AutoGPTQ
"]
anthropic
=
["anthropic"]
openai
=
[
"openai"
,
"tiktoken"
]
openai
=
[
"openai>=1.3.5"
,
"tiktoken"
]
vllm
=
["vllm"]
all
=
[
"lm_eval[dev]"
,
"lm_eval[testing]"
,
...
...
@@ -80,5 +81,6 @@ all = [
"lm_eval[promptsource]"
,
"lm_eval[gptq]"
,
"lm_eval[anthropic]"
,
"lm_eval[openai]"
"lm_eval[openai]"
,
"lm_eval[vllm]"
,
]
tests/models/test_vllm.py
0 → 100644
View file @
835cc40e
import
pytest
from
typing
import
List
from
lm_eval.api.instance
import
Instance
import
lm_eval.tasks
as
tasks
import
sys
import
torch
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
    """Smoke tests for the ``VLLM`` model wrapper.

    The class body runs once, when the class object is created: it
    instantiates ``VLLM`` and builds request batches (``limit=10``) for the
    ``arc_easy``, ``gsm8k`` and ``wikitext`` tasks, which the test methods
    below feed to the model.
    """

    # NOTE(review): pytest's default class discovery prefix is ``Test``; the
    # name ``TEST_VLLM`` may not be collected unless ``python_classes`` is
    # configured elsewhere -- confirm against the project's pytest settings.

    # Skip when the optional ``vllm`` package is not installed.
    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM

        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError as err:
        # Skip explicitly instead of silently continuing: without ``LM`` every
        # test below would fail later with an unrelated AttributeError.
        pytest.skip(f"vLLM backend unavailable: {err}", allow_module_level=True)
    torch.use_deterministic_algorithms(True)
    tasks.initialize_tasks()

    # Requests passed to ``LM.loglikelihood``.
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    # Requests passed to ``LM.generate_until``; ``max_gen_toks`` is set to 10.
    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until: List[Instance] = generate_until_task.instances

    # Requests passed to ``LM.loglikelihood_rolling``.
    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    # TODO: make proper tests
    def test_logliklihood(self) -> None:
        """Check ``loglikelihood`` yields one result per request, each starting with a float."""
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        """Check ``generate_until`` yields one string per request."""
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    def test_logliklihood_rolling(self) -> None:
        """Check every ``loglikelihood_rolling`` result is a float."""
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)
tests/tests_master/test_models.py
View file @
835cc40e
import
hashlib
import
json
import
openai
import
os
import
pickle
import
pytest
...
...
@@ -8,6 +7,10 @@ import unittest.mock as mock
import
lm_eval.models
as
models
from
openai
import
OpenAI
client
=
OpenAI
()
LOGLIKELIHOOD_TEST_CASES
=
[
(
"The quick brown fox jumps over the lazy"
,
" dog"
),
...
...
@@ -172,7 +175,7 @@ def openai_mock_completion(**kwargs):
if
os
.
path
.
exists
(
fname
):
with
open
(
fname
,
"rb"
)
as
fh
:
return
pickle
.
load
(
fh
)
ret
=
openai
.
C
ompletion
.
create
(
**
kwargs
)
ret
=
client
.
c
ompletion
s
.
create
(
**
kwargs
)
ret
.
api_key
=
""
with
open
(
fname
,
"wb"
)
as
fh
:
pickle
.
dump
(
ret
,
fh
)
...
...
Prev
1
…
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment