gaoqiong / lm-evaluation-harness · Commits

Commit 3e1301bb
Authored Jun 04, 2024 by lintangsutawika

    resolved merge conflict from latest version

Parents: fd9cd80f, 070d31df

Showing 19 changed files with 276 additions and 225 deletions (+276 −225)
lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction    +18  −0
lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive   +18  −0
lm_eval/tasks/unitxt/unitxt_wrapper.py                        +46  −0
lm_eval/tasks/unitxt/xsum.yaml                                 +3  −0
lm_eval/tasks/unitxt/yahoo_answers_topics.yaml                 +3  −0
lm_eval/utils.py                                              +25  −6
pyproject.toml                                                 +5  −2
tests/models/test_gguf.py                                      +2  −2
tests/models/test_huggingface.py                              +10  −6
tests/models/test_neuralmagic.py                               +1  −1
tests/models/test_openvino.py                                  +4  −4
tests/models/test_vllm.py                                      +1  −1
tests/test_evaluator.py                                       +10  −7
tests/test_janitor.py                                         +53  −114
tests/test_requests_caching.py                                 +6  −7
tests/test_tasks.py                                            +2  −5
tests/test_utils.py                                           +32  −34
tests/testyamls/test-01.yaml                                  +32  −30
tests/utils.py                                                 +5  −6
lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction (new file, mode 100644)

group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_ner
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive (new file, mode 100644)

group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_rouge
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
lm_eval/tasks/unitxt/unitxt_wrapper.py (new file, mode 100644)

try:
    from unitxt import evaluate
except ImportError:
    raise ImportError(
        "Package 'unitxt' is not installed. To install it, use `pip install 'lm_eval[unitxt]'`"
    )

from lm_eval.api.registry import AGGREGATION_REGISTRY, METRIC_REGISTRY, register_metric


def unitxt_agg_metric(items):
    preds = [pred[0] for pred, _, _ in items]
    refs = [ref for _, ref, _ in items]
    metric_name = items[0][2].replace("unitxt_", "metrics.")
    for ref in refs:
        ref["metrics"] = [metric_name]

    result_metrics = evaluate(preds, refs)
    return result_metrics[0]["score"]["global"]["score"]


AGGREGATION_REGISTRY["unitxt"] = unitxt_agg_metric


def unitxt_metric(items):
    # This is a passthrough function
    return items


def process_results(doc, results):
    metrics = doc["metrics"]

    scores = {}
    for metric in metrics:
        metric = metric.replace("metrics.", "unitxt_")
        scores[metric] = (results, doc, metric)
        if metric not in METRIC_REGISTRY:
            register_metric(
                metric=metric,
                higher_is_better=True,
                output_type="generate_until",
                aggregation="unitxt",
            )(unitxt_metric)
    return scores
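A minimal sketch (not part of the commit) of how this wrapper is expected to be driven: the task config routes per-document scoring through process_results, which maps each unitxt metric name to a "unitxt_*" key, registers it on first sight, and hands back (results, doc, metric) triples that the "unitxt" aggregation later forwards to unitxt.evaluate. The doc fields below are illustrative and the snippet assumes the unitxt package is installed and the module is importable as unitxt_wrapper.

    from unitxt_wrapper import process_results  # the file added above

    # Hypothetical document as produced by a unitxt-backed task; only the
    # "metrics" key is actually consumed by process_results.
    doc = {"metrics": ["metrics.rouge"], "references": ["a short gold summary"]}
    results = ["a model-generated summary"]

    scores = process_results(doc, results)
    # -> {"unitxt_rouge": (["a model-generated summary"], {...doc...}, "unitxt_rouge")}
    print(list(scores))  # ['unitxt_rouge']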
lm_eval/tasks/unitxt/xsum.yaml (new file, mode 100644)

include: unitxt_tasks.summarization.abstractive
task: xsum
dataset_name: card=cards.xsum,template=templates.summarization.abstractive.full
lm_eval/tasks/unitxt/yahoo_answers_topics.yaml (new file, mode 100644)

include: unitxt_tasks.classification.multi_class
task: yahoo_answers_topics
dataset_name: card=cards.yahoo_answers_topics,template=templates.classification.multi_class.title
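For orientation, a hedged sketch of how one of the new unitxt-backed tasks could be invoked once the optional dependency named in the wrapper's error message (`pip install 'lm_eval[unitxt]'`) is available; model choice and limit are illustrative only.

    from lm_eval import evaluator

    out = evaluator.simple_evaluate(
        model="hf",
        model_args="pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        tasks=["xsum"],   # resolved via lm_eval/tasks/unitxt/xsum.yaml above
        limit=5,          # keep the smoke run cheap
    )
    print(out["results"]["xsum"])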
lm_eval/utils.py

@@ -26,6 +26,11 @@ eval_logger = logging.getLogger("lm-eval")
 SPACING = " " * 47

+HIGHER_IS_BETTER_SYMBOLS = {
+    True: "↑",
+    False: "↓",
+}
+

 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()

@@ -76,6 +81,18 @@ def handle_non_serializable(o):
         return str(o)


+def sanitize_list(sub):
+    """
+    Takes possible nested list and recursively converts all inner component to strings
+    """
+    if isinstance(sub, list):
+        return [sanitize_list(item) for item in sub]
+    if isinstance(sub, tuple):
+        return tuple(sanitize_list(item) for item in sub)
+    else:
+        return str(sub)
+
+
 def simple_parse_args_string(args_string):
     """
     Parses something like

@@ -257,6 +274,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
         "Filter",
         "n-shot",
         "Metric",
+        "",
         "Value",
         "",
         "Stderr",

@@ -276,10 +294,8 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
     for k in keys:
         dic = result_dict[column][k]
         version = result_dict["versions"].get(k, " N/A")
-        if k in result_dict["n-shot"]:
-            n = str(result_dict["n-shot"][k])
-        else:
-            n = " "
+        n = str(result_dict.get("n-shot", " ").get(k, " "))
+        higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})

         if "alias" in dic:
             k = dic.pop("alias")

@@ -290,13 +306,16 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
                 continue
             if v != " ":
                 v = "%.4f" % v
+            hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")

             if m + "_stderr" + "," + f in dic:
                 se = dic[m + "_stderr" + "," + f]
                 if se != "N/A":
                     se = "%.4f" % se
-                values.append([k, version, f, n, m, v, "±", se])
+                values.append([k, version, f, n, m, hib, v, "±", se])
             else:
-                values.append([k, version, f, n, m, v, "", ""])
+                values.append([k, version, f, n, m, hib, v, "", ""])
             k = ""
             version = ""
     md_writer.value_matrix = values
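Illustrative only (not in the diff): what the two additions to lm_eval.utils do in isolation. sanitize_list stringifies the leaves of nested lists/tuples, and HIGHER_IS_BETTER_SYMBOLS supplies the arrow that make_table now prints in the new column between "Metric" and "Value".

    from lm_eval.utils import HIGHER_IS_BETTER_SYMBOLS, sanitize_list

    print(sanitize_list([1, (2, [3, None])]))   # ['1', ('2', ['3', 'None'])]
    print(HIGHER_IS_BETTER_SYMBOLS[True])       # '↑'
    print(HIGHER_IS_BETTER_SYMBOLS[False])      # '↓'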
pyproject.toml

@@ -19,7 +19,7 @@ classifiers = [
 requires-python = ">=3.8"
 license = { "text" = "MIT" }
 dependencies = [
-    "accelerate>=0.21.0",
+    "accelerate>=0.26.0",
     "evaluate",
     "datasets>=2.16.0",
     "evaluate>=0.4.0",

@@ -39,6 +39,7 @@ dependencies = [
     "dill",
     "word2number",
     "more_itertools",
     "shortuuid",
 ]

 [tool.setuptools.packages.find]

@@ -73,9 +74,10 @@ promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-vllm = ["vllm==0.3.2"]
+vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
+unitxt = ["unitxt"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",

@@ -94,6 +96,7 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
+    "lm_eval[unitxt]"
 ]

 [tool.ruff.lint]
tests/models/test_gguf.py

@@ -15,11 +15,11 @@ base_url = "https://matthoffner-ggml-llm-api.hf.space"
 def gguf_completion_mock(base_url=None, **kwargs):
     # Generate a hash from the parameters
     hash_kwargs = {"base_url": base_url, **kwargs}
-    hash = hashlib.sha256(
+    parameters_hash = hashlib.sha256(
         json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
     ).hexdigest()

-    fname = f"./tests/testdata/gguf_test_{hash}.pkl"
+    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"

     if os.path.exists(fname):
         with open(fname, "rb") as fh:
tests/models/test_huggingface.py

 from __future__ import annotations

+import os
 import sys
 from pathlib import Path

 import numpy as np
 import torch

-import lm_eval.tasks as tasks
+from lm_eval import tasks
 from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

+TEST_STRING = "foo bar"


 class Test_HFLM:
     torch.use_deterministic_algorithms(True)

@@ -107,7 +111,7 @@ class Test_HFLM:
         file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
         file_path = file_path.resolve()
-        with open(file_path, "w") as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write("\n".join(str(x) for x in _res))
         assert np.allclose(_res, _RES, atol=1e-2)

     # check indices for Multiple Choice

@@ -126,19 +130,19 @@ class Test_HFLM:
         assert np.allclose(res, self.ROLLING_RES, atol=1e-1)

     def test_toc_encode(self) -> None:
-        res = self.LM.tok_encode("foo bar")
+        res = self.LM.tok_encode(TEST_STRING)
         assert res == [12110, 2534]

     def test_toc_decode(self) -> None:
         res = self.LM.tok_decode([12110, 2534])
-        assert res == "foo bar"
+        assert res == TEST_STRING

     def test_batch_encode(self) -> None:
-        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
+        res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
         assert res == [[12110, 2534], [2009, 17374]]

     def test_model_generate(self) -> None:
-        context = self.LM.tok_batch_encode(["foo bar"])[0]
+        context = self.LM.tok_batch_encode([TEST_STRING])[0]
         res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
         res = self.LM.tok_decode(res[0])
         assert res == "foo bar\n<bazhang>!info bar"
tests/models/test_neuralmagic.py

 import pytest

-import lm_eval.evaluator as evaluator
+from lm_eval import evaluator
 from lm_eval.api.registry import get_model
tests/models/test_openvino.py

@@ -6,7 +6,7 @@ import pytest
 from optimum.intel import OVModelForCausalLM
 from transformers import AutoTokenizer

-import lm_eval.evaluator as evaluator
+from lm_eval import evaluator
 from lm_eval.api.registry import get_model

@@ -46,7 +46,7 @@ def test_evaluator(model_id, task):
     random.seed(42)
     for _ in reqs:
-        res.append((-random.random(), False))
+        res.extend([(-random.random(), False)])

     return res

@@ -57,7 +57,7 @@ def test_evaluator(model_id, task):
     res = []
     random.seed(42)
     for _ in reqs:
-        res.append(-random.random())
+        res.extend([-random.random()])

     return res

@@ -79,7 +79,7 @@ def test_ov_config():
     model_id = "hf-internal-testing/tiny-random-gpt2"
     with tempfile.TemporaryDirectory() as tmpdirname:
         config_file = str(Path(tmpdirname) / "ov_config.json")
-        with open(Path(config_file), "w") as f:
+        with open(Path(config_file), "w", encoding="utf-8") as f:
             f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}')
         lm = get_model("openvino").create_from_arg_string(
             f"pretrained={model_id},ov_config={config_file}"
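Many of the test edits in this commit (here and in the files that follow) are the same mechanical substitution; a minimal sketch of the pattern, which is behavior-preserving for a single element:

    res = []
    res.append((1.0, False))     # old style seen on the removed lines
    res.extend([(1.0, False)])   # new style on the added lines; same effect
    assert res[0] == res[1]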
tests/models/test_vllm.py

@@ -3,7 +3,7 @@ from typing import List
 import pytest
 import torch

-import lm_eval.tasks as tasks
+from lm_eval import tasks
 from lm_eval.api.instance import Instance
tests/test_evaluator.py

 # import lm_eval.base as base
+import os
 from typing import List

 import pytest

 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
 from lm_eval import tasks

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces


 @pytest.mark.parametrize(
-    "task_name,limit,model,model_args",
+    "task_name,limit,model,model_args,bootstrap_iters",
     [
         (
             ["arc_easy"],
             10,
             "hf",
             "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+            0,
         ),
         (
             ["mmlu_abstract_algebra"],
             None,
             "hf",
             "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+            10000,
         ),
     ],
 )
-def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
-    # task_name = task_name
-    # limit = 10
+def test_evaluator(
+    task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
+):
     e1 = evaluator.simple_evaluate(
         model=model,
         tasks=task_name,
         limit=limit,
         model_args=model_args,
+        bootstrap_iters=bootstrap_iters,
     )
     assert e1 is not None

@@ -57,6 +59,7 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
         lm=lm,
         task_dict=task_dict,
         limit=limit,
+        bootstrap_iters=bootstrap_iters,
     )
     assert e2 is not None
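A hedged sketch of the new parameter exercised above: bootstrap_iters is now threaded through the test into both evaluation entry points, so the matrix can run with a small or zero bootstrap budget. Values below are illustrative.

    from lm_eval import evaluator

    res = evaluator.simple_evaluate(
        model="hf",
        model_args="pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        tasks=["arc_easy"],
        limit=10,
        bootstrap_iters=0,  # presumably skips bootstrap stderr estimation entirely
    )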
tests/test_janitor.py

+import os
 from collections import defaultdict

 from lm_eval.decontamination.janitor import (

@@ -9,23 +10,41 @@ from lm_eval.decontamination.janitor import (
 )

+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+TEST_SEQUENCE = (
+    "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
+    " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
+)
+
+JANITOR_EXPECTED = (
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing "
+    " characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+    "This is a @line #containing a certain number of characters, 76 to be exact. "
+)
+
+JANITOR_FILTH1 = "filth lots of dirty filthy filth"
+JANITOR_FILTH2 = "filth lots of filthy dirty filth"
+

 def simple_ngram(sequence, n):
     ngrams = list()
     ngram = []
     for x in sequence:
-        ngram.append(x)
+        ngram.extend([x])
         if len(ngram) == n:
-            ngrams.append(tuple(ngram))
+            ngrams.extend([tuple(ngram)])
             ngram = ngram[1:]
     return ngrams


 def test_form_ngrams():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     n_values = [1, 2, 3, 5, 13]

     for n in n_values:

@@ -36,10 +55,7 @@ def test_form_ngrams():
 def test_word_ngrams():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     words = sequence.split()

@@ -53,10 +69,7 @@ def test_word_ngrams():
 def test_split_indices():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     comparison = []
     current_word = ""

@@ -65,12 +78,18 @@ def test_split_indices():
             current_word += c
         else:
             if current_word:
-                comparison.append((current_word, (i - len(current_word), i - 1)))
+                comparison.extend([(current_word, (i - len(current_word), i - 1))])
                 current_word = ""

     if current_word:
-        comparison.append(
-            (current_word, (len(sequence) - len(current_word), len(sequence) - 1))
-        )
+        len_sequence = len(sequence)
+        comparison.extend(
+            [
+                (
+                    current_word,
+                    (len_sequence - len(current_word), len_sequence - 1),
+                )
+            ]
+        )
         current_word = ""

@@ -80,10 +99,7 @@ def test_split_indices():
 def test_word_ngrams_indices():
-    sequence = (
-        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
-        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
-    )
+    sequence = TEST_SEQUENCE

     n_values = [1, 2, 3, 5, 13]

@@ -100,14 +116,13 @@ def test_word_ngrams_indices():
                 tracker[ngram] = end + 1

                 # ignore partial word matches
-                if (start != 0 and sequence[start - 1] != " ") or (
-                    end != len(sequence) - 1 and sequence[end + 1] != " "
+                if not (
+                    (start != 0 and sequence[start - 1] != " ")
+                    or (end != len(sequence) - 1 and sequence[end + 1] != " ")
                 ):
-                    pass
-                else:
                     break

-            comparison.append((ngram, (start, end)))
+            comparison.extend([(ngram, (start, end))])

         result_to_test = list(word_ngrams_indices(sequence, n))
         assert len(result_to_test) == len(comparison)

@@ -184,17 +199,6 @@ def test_janitor2():
     filth = "filth"
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )

     janitor = Janitor(
         ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -207,7 +211,7 @@ def test_janitor2():
     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor3():

@@ -229,19 +233,6 @@ def test_janitor3():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filth = "filth lots of dirty filthy filth"
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -249,12 +240,12 @@ def test_janitor3():
     result = "".join(result)
     assert result == sequence

-    janitor.register_contaminant(filth)
-    assert janitor.dirt_ngrams == {filth}
+    janitor.register_contaminant(JANITOR_FILTH1)
+    assert janitor.dirt_ngrams == {JANITOR_FILTH1}

     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor4():

@@ -284,19 +275,6 @@ def test_janitor4():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filth = "filth lots of dirty filthy filth"
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -304,12 +282,12 @@ def test_janitor4():
     result = "".join(result)
     assert result == sequence

-    janitor.register_contaminant(filth)
-    assert janitor.dirt_ngrams == {filth}
+    janitor.register_contaminant(JANITOR_FILTH1)
+    assert janitor.dirt_ngrams == {JANITOR_FILTH1}

     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor5():

@@ -338,18 +316,7 @@ def test_janitor5():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )
+    filths = [JANITOR_FILTH1, JANITOR_FILTH2]

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -364,7 +331,7 @@ def test_janitor5():
     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor6():

@@ -401,18 +368,7 @@ def test_janitor6():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
-    expected_result = (
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing "
-        " characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-        "This is a @line #containing a certain number of characters, 76 to be exact. "
-    )
+    filths = [JANITOR_FILTH1, JANITOR_FILTH2]

     janitor = Janitor(
         ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200

@@ -427,7 +383,7 @@ def test_janitor6():
     result = janitor.clean_python(sequence)
     result = "".join(result)
-    assert result == expected_result
+    assert result == JANITOR_EXPECTED


 def test_janitor7():

@@ -465,7 +421,7 @@ def test_janitor7():
         "This is a @line #containing a certain number of characters, 76 to be exact. "
     )
-    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
+    filths = [JANITOR_FILTH1, JANITOR_FILTH2]

     expected_result = ""

@@ -488,20 +444,3 @@ def test_janitor7():
 def test_janitor8():
     # This will test the save and load contams
     pass

-    # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
-    # contaminant = "dirty boy. Clean he he"

-    # jan = Janitor(ngram_n=3)
-    # jan.register_contaminant(contaminant)
-    # cleaned = " ".join(jan.clean(source))

-    # for contam in jan.dirt_ngrams:
-    #     assert contam not in cleaned, contam

-    # filename = "data/saved_contam"
-    # jan.save_contamination_ngrams(filename)

-    # jan = Janitor(ngram_n=3)
-    # jan.load_contamination_ngrams(filename)
-    # cleaned = " ".join(jan.clean(source))

-    # for contam in jan.dirt_ngrams:
-    #     assert contam not in cleaned, contam
tests/test_requests_caching.py

 # import lm_eval.base as base
 import importlib
 import os
 import sys
 from datetime import datetime
-from typing import List, Tuple
+from typing import List, Optional, Tuple

 import pytest
 import torch

 # import lm_eval.models as models
 from lm_eval.caching.cache import PATH

@@ -43,7 +41,7 @@ def clear_cache():
 # leaving tasks here to allow for the option to select specific task files
-def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
+def get_cache_files(tasks: Optional[List[str]] = None) -> Tuple[List[str], List[str]]:
     cache_files = os.listdir(PATH)
     file_task_names = []

@@ -51,7 +49,7 @@ def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
     for file in cache_files:
         file_without_prefix = file.split("-")[1]
         file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
-        file_task_names.append(file_without_prefix_and_suffix)
+        file_task_names.extend([file_without_prefix_and_suffix])

     return cache_files, file_task_names

@@ -113,10 +111,11 @@ if __name__ == "__main__":
         # test_requests_caching_refresh,
         # test_requests_caching_delete,
     ]

+    # Lookups of global names within a loop is inefficient, so copy to a local variable outside of the loop first
+    default_tasks = DEFAULT_TASKS
     for test_func in tests:
         clear_cache()
-        test_func(tasks=DEFAULT_TASKS)
+        test_func(tasks=default_tasks)

     print("Tests pass")
tests/test_tasks.py

+import os
 from itertools import islice

 import pytest

@@ -8,6 +9,7 @@ from lm_eval.api.task import ConfigurableTask
 from .utils import new_tasks

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

 # Default Task
 TASKS = ["arc_easy"]

@@ -87,7 +89,6 @@ class TestNewTasks:
             )
             if "multiple_choice" in task._config.output_type:
                 _array = [task.doc_to_choice(doc) for doc in arr]
-                # assert all(len(x) == 4 for x in _array)
                 assert all(isinstance(x, list) for x in _array)
                 assert all(isinstance(x[0], str) for x in _array)

@@ -101,9 +102,6 @@ class TestNewTasks:
         _array_target = [task.doc_to_target(doc) for doc in arr]
         if task._config.output_type == "multiple_choice":
             assert all(isinstance(label, int) for label in _array_target)
-            # _array_text = [task.doc_to_text(doc) for doc in arr]
-            # Not working
-            # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

     def test_build_all_requests(self, task_class, limit):
         task_class.build_all_requests(rank=1, limit=limit, world_size=1)

@@ -118,5 +116,4 @@ class TestNewTasks:
             else list(islice(task.validation_docs(), limit))
         )
         requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-        # assert all(isinstance(doc, list) for doc in requests)
         assert len(requests) == limit if limit else True
tests/test_utils.py

@@ -41,7 +41,7 @@ def test_get_rolling_token_windows_v1():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -70,7 +70,7 @@ def test_get_rolling_token_windows_v2():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -115,7 +115,7 @@ def test_get_rolling_token_windows_v3():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -156,7 +156,7 @@ def test_get_rolling_token_windows_v4():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -185,7 +185,7 @@ def test_get_rolling_token_windows_v5():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -210,7 +210,7 @@ def test_get_rolling_token_windows_v6():
     pred_length = 0
     output = []
     for input_tokens, pred_tokens in generator:
-        output.append((input_tokens, pred_tokens))
+        output.extend([(input_tokens, pred_tokens)])
         pred_length += len(pred_tokens)
     assert pred_length == len(x)
     assert gold == output

@@ -273,26 +273,26 @@ class TestCollator:
         generation_samples = self.make_generate_sample(int(end))
         gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
-        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
+        chunks_gen = gens.get_batched(n=int(batch_size), batch_fn=None)
         output = []
-        for chunks in chunks:
-            group_one = end // 2
-            group_two = end - end // 2
+        is_batch = batch_size != 0
+        for chunks in chunks_gen:
+            # check batching
+            group_one = end // 2
+            group_two = end - end // 2
             assert (
                 len(chunks) <= batch_size
-                if batch_size != 0
+                if is_batch
                 else len(chunks) in [group_one, group_two]
             )
             # check if reorder-er is working correctly
-            assert all(
-                len(chunks[i][0]) <= len(chunks[i - 1][0])
-                for i in range(1, len(chunks))
-            )
+            chunk_lengths = [len(chunk[0]) for chunk in chunks]
+            assert chunk_lengths == sorted(chunk_lengths, reverse=True)
             # check if grouping correctly
-            assert all(x[1] == chunks[0][1] for x in chunks)
+            chunk_to_compare = chunks[0][1]
+            assert all(x[1] == chunk_to_compare for x in chunks)
             for x in chunks:
-                output.append(x)
+                output.extend([x])
         reordered_output = gens.get_original(output)
         # check get original
         assert reordered_output == generation_samples

@@ -305,18 +305,17 @@ class TestCollator:
             loglikelihood_samples,
             _collate_log,
         )
-        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
+        chunks_gen = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
         output = []
-        for chunks in chunks:
+        is_batch = batch_size != 0
+        for chunks in chunks_gen:
             # check batching
-            assert len(chunks) <= batch_size if batch_size != 0 else len(chunks) == end
+            assert len(chunks) <= batch_size if is_batch else len(chunks) == end
             # check reorder
-            assert all(
-                len(chunks[i][1]) <= len(chunks[i - 1][1])
-                for i in range(1, len(chunks))
-            )
+            chunk_lengths = [len(chunk[1]) for chunk in chunks]
+            assert chunk_lengths == sorted(chunk_lengths, reverse=True)
             for x in chunks:
-                output.append(x[1])
+                output.extend([x[1]])
         # check indices
         reordered_output = loglikelihoods.get_original(output)
         assert reordered_output == [x[1] for x in loglikelihood_samples]

@@ -335,18 +334,17 @@ class TestCollator:
             group_fn=lambda a: a[-2] + a[-1][:-1],
             group_by="contexts",
         )
-        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
+        chunks_gen = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
         output = []
         outputs_ = []
-        for chunks in chunks:
+        is_batch = batch_size != 0
+        for chunks in chunks_gen:
             # check batching
-            if batch_size != 0:
+            if is_batch:
                 assert len(chunks) <= batch_size
             # check reorder
-            assert all(
-                len(chunks[i][1]) <= len(chunks[i - 1][1])
-                for i in range(1, len(chunks))
-            )
+            chunk_lengths = [len(chunk[1]) for chunk in chunks]
+            assert chunk_lengths == sorted(chunk_lengths, reverse=True)
             for x in chunks:
                 for request_str, cont_toks, logits in loglikelihoods.get_cache(
                     req_str="".join(x[0]),

@@ -356,8 +354,8 @@ class TestCollator:
                     .unsqueeze(0)
                     .unsqueeze(0),
                 ):
-                    output.append(x[1])
-                    outputs_.append(cont_toks)
+                    output.extend([x[1]])
+                    outputs_.extend([cont_toks])
         assert len(output) == len(outputs_)
         # check indices
         reordered_output = loglikelihoods.get_original(output)
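The Collator test rewrite above is largely a shadowing cleanup; a minimal sketch (names hypothetical) of why the rename helps: the old loop reused one name for both the batch generator and the current batch, so the generator object was no longer reachable by name inside the loop body and the code was easy to misread.

    chunks_gen = ([1, 2], [3])          # stand-in for gens.get_batched(...)
    output = []
    for chunks in chunks_gen:           # old code wrote: for chunks in chunks:
        output.extend([len(chunks)])
    print(output)                       # [2, 1]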
tests/testyamls/test-01.yaml

@@ -3,12 +3,12 @@ group_alias: test 1
 task:
   - piqa # string task
   - ai2_arc # string tag
-  - task: super-glue-lm-eval-v1 # Should this be spread out?
-    num_fewshot: 3
+  # - task: super-glue-lm-eval-v1 # Should this be spread out?
+  #   num_fewshot: 3
   - task: swag # dict registered task
     num_fewshot: 2
-  - task: mmlu
-    num_fewshot: 5
+  # - task: mmlu
+  #   num_fewshot: 5
   - group: nli-tasks # dict group
     task:
       - anli

@@ -17,29 +17,31 @@ task:
     num_fewshot: 4
     metric_list:
       - metric: brier_score
-  - task: sciq # dict registered task duplicate
-    task_alias: sciq 2-shot
-    num_fewshot: 2
-  - task: sciq # dict registered task duplicate
-    task_alias: sciq 4-shot
-    num_fewshot: 4
-  - task: sciq # dict registered task duplicate
-    task_alias: sciq 6-shot
-    num_fewshot: 6
-  - task: siqa_custom # dict task
-    dataset_path: social_i_qa
-    dataset_name: null
-    output_type: multiple_choice
-    training_split: train
-    validation_split: validation
-    doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
-    target_delimiter: " "
-    doc_to_choice:
-      - "{{answerA}}"
-      - "{{answerB}}"
-      - "{{answerC}}"
-    doc_to_target: "{{ (label|int) - 1 }}"
-    metric_list:
-      - metric: acc
-        aggregation: mean
-        higher_is_better: true
-        aggregate_metric: true
+  # - task: sciq # dict registered task duplicate
+  #   task_alias: sciq 2-shot
+  #   num_fewshot: 2
+  # - task: sciq # dict registered task duplicate
+  #   task_alias: sciq 4-shot
+  #   num_fewshot: 4
+  # - task: sciq # dict registered task duplicate
+  #   task_alias: sciq 6-shot
+  #   num_fewshot: 6
+  # - task: siqa_custom # dict task
+  #   dataset_path: social_i_qa
+  #   dataset_name: null
+  #   output_type: multiple_choice
+  #   training_split: train
+  #   validation_split: validation
+  #   doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
+  #   target_delimiter: " "
+  #   doc_to_choice:
+  #     - "{{answerA}}"
+  #     - "{{answerB}}"
+  #     - "{{answerC}}"
+  #   doc_to_target: "{{ (label|int) - 1 }}"
+  #   metric_list:
+  #     - metric: acc
+  #       aggregation: mean
+  #       higher_is_better: true
tests/utils.py

@@ -12,9 +12,9 @@ from lm_eval.utils import load_yaml_config
 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
-    with open(file_path, "r") as f:
+    with open(file_path, "r", encoding="utf-8") as f:
         content = f.read()
-        words_list = [x for x in content.split()]
+        words_list = list(content.split())
     return words_list

@@ -25,7 +25,7 @@ def load_changed_files(file_path: str) -> List[str]:
 def parser(full_path: List[str]) -> List[str]:
     _output = set()
     for x in full_path:
-        if os.path.exists(x) and x.endswith(".yaml"):
+        if x.endswith(".yaml") and os.path.exists(x):
             config = load_yaml_config(x, mode="simple")
             if isinstance(config["task"], str):
                 _output.add(config["task"])

@@ -40,10 +40,9 @@ def new_tasks() -> Union[List[str], None]:
         # If tasks folder has changed then we get the list of files from FILENAME
         # and parse the yaml files to get the task names.
         return parser(load_changed_files(FILENAME))
-    elif os.getenv("API") is not None:
+    if os.getenv("API") is not None:
         # Or if API has changed then we set the ENV variable API to True
         # and run given tasks.
         return ["arc_easy", "hellaswag", "piqa", "wikitext"]
     # if both not true just do arc_easy
-    else:
-        return
+    return None