gaoqiong / lm-evaluation-harness · Commits

Commit f66fc06f
authored Feb 01, 2024 by haileyschoelkopf

    fix merge conflicts

parents: b13753cd, d714fc95
Changes: 84
Showing 20 changed files with 213 additions and 131 deletions (+213 -131)
lm_eval/tasks/qasper/utils.py                        +1   -1
lm_eval/tasks/scrolls/scrolls.yaml                   +14  -7
lm_eval/tasks/scrolls/task.py                        +7   -11
lm_eval/tasks/squadv2/squadv2.yaml                   +2   -0
lm_eval/tasks/squadv2/task.py                        +0   -2
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml          +2   -5
lm_eval/tasks/super_glue/wsc/t5_utils.py             +56  -60
lm_eval/tasks/xwinograd/utils.py                     +1   -1
lm_eval/utils.py                                     +12  -5
pyproject.toml                                       +4   -5
scripts/build_benchmark.py                           +2   -2
scripts/clean_training_data/generate_13_grams.py     +2   -2
scripts/clean_training_data/investigate_pile.py      +2   -2
scripts/make_table_results.py                        +3   -3
scripts/make_table_tasks.py                          +3   -4
scripts/regression.py                                +5   -1
scripts/write_out.py                                 +7   -8
scripts/zeno_visualize.py                            +12  -8
tests/models/test_huggingface.py                     +5   -4
tests/models/test_openvino.py                        +73  -0
lm_eval/tasks/qasper/utils.py
@@ -50,7 +50,7 @@ def process_docs(dataset, set_answer_type="bool"):
             obs_list["abstract"].append(abstract)
             obs_list["question"].append(question)
             obs_list["answer_type"].append(answer_type)
-            if type(answer) == list:
+            if isinstance(answer, list):
                 answer = ", ".join(answer)
             obs_list["answer"].append(answer)
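Annotation (not part of the commit): the changed line swaps an exact-type comparison for isinstance, which linters commonly flag (pycodestyle E721) and which also accepts list subclasses. A small illustrative snippet:

    class AnswerList(list):
        pass

    answers = AnswerList(["yes", "no"])
    print(type(answers) == list)      # False: exact-type check rejects subclasses
    print(isinstance(answers, list))  # True: subclasses still count as lists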
lm_eval/tasks/scrolls/scrolls.yaml
 group: scrolls
 task:
-  - scrolls_qasper
-  - scrolls_quality
-  - scrolls_narrativeqa
-  - scrolls_contractnli
-  - scrolls_govreport
-  - scrolls_summscreenfd
-  - scrolls_qmsum
+  - task: scrolls_qasper
+    class: !function task.Qasper
+  - task: scrolls_quality
+    class: !function task.QuALITY
+  - task: scrolls_narrativeqa
+    class: !function task.NarrativeQA
+  - task: scrolls_contractnli
+    class: !function task.ContractNLI
+  - task: scrolls_govreport
+    class: !function task.GovReport
+  - task: scrolls_summscreenfd
+    class: !function task.SummScreenFD
+  - task: scrolls_qmsum
+    class: !function task.QMSum
lm_eval/tasks/scrolls/task.py
@@ -115,7 +115,9 @@ class _SCROLLSTask(Task):
     PRUNE_MAX_TOKENS = None
     PRUNE_NUM_PROC = None
 
-    def __post_init__(self):
+    def __init__(self):
+        super().__init__()
         if self.DATASET_NAME is not None:
             self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
 
     def has_training_docs(self):
@@ -224,9 +226,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
     def process_results(self, doc, results):
         gold = doc["gold"]
 
-        acc = 1.0 if np.argmax(results) == gold else 0.0
+        lls, _ = zip(*results)
+
+        acc = 1.0 if np.argmax(lls) == gold else 0.0
         completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
+        acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
 
         return {
             "acc": acc,
@@ -279,7 +282,6 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
         return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
 
 
-@register_task("scrolls_qasper")
 class Qasper(_SCROLLSTask):
     """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
     https://arxiv.org/abs/2105.03011
@@ -337,7 +339,6 @@ class Qasper(_SCROLLSTask):
         )
 
 
-@register_task("scrolls_quality")
 class QuALITY(_SCROLLSMultipleChoiceTask):
     """QuALITY: Question Answering with Long Input Texts, Yes!
     https://arxiv.org/abs/2112.08608
@@ -366,7 +367,6 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
         return [doc]
 
 
-@register_task("scrolls_narrativeqa")
 class NarrativeQA(_SCROLLSTask):
     """The NarrativeQA Reading Comprehension Challenge
     https://arxiv.org/abs/1712.07040
@@ -400,7 +400,6 @@ class NarrativeQA(_SCROLLSTask):
         )
 
 
-@register_task("scrolls_contractnli")
 class ContractNLI(_SCROLLSMultipleChoiceTask):
     """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
     https://arxiv.org/abs/1712.07040
@@ -419,7 +418,6 @@ class ContractNLI(_SCROLLSMultipleChoiceTask):
         return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"
 
 
-@register_task("scrolls_govreport")
 class GovReport(_SCROLLSSummaryTask):
     """Efficient Attentions for Long Document Summarization
     https://arxiv.org/abs/2104.02112
@@ -433,7 +431,6 @@ class GovReport(_SCROLLSSummaryTask):
     DATASET_NAME = "gov_report"
 
 
-@register_task("scrolls_summscreenfd")
 class SummScreenFD(_SCROLLSSummaryTask):
     """SummScreen: A Dataset for Abstractive Screenplay Summarization
     https://arxiv.org/abs/2104.07091
@@ -442,7 +439,6 @@ class SummScreenFD(_SCROLLSSummaryTask):
     DATASET_NAME = "summ_screen_fd"
 
 
-@register_task("scrolls_qmsum")
 class QMSum(_SCROLLSSummaryTask):
     """QMSum: A New Benchmark for Query-based Multi-domain
     Meeting Summarization
lm_eval/tasks/squadv2/squadv2.yaml (new file, 0 → 100644)
+task: squadv2
+class: !function task.SQuAD2
lm_eval/tasks/squadv2/task.py
@@ -21,7 +21,6 @@ from packaging import version
 from lm_eval.api.task import Task
 from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task
 
 
 _CITATION = """
 @misc{rajpurkar2018know,
@@ -47,7 +46,6 @@ def _squad_agg(key, items):
     return _squad_metric(predictions=predictions, references=references).get(key, 0)
 
 
-@register_task("squadv2")
 class SQuAD2(Task):
     VERSION = 3
     DATASET_PATH = "squad_v2"
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
@@ -7,6 +7,7 @@ training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: !function "t5_utils.doc_to_text"
+process_results: !function "t5_utils.process_results"
 doc_to_target: label
 generation_kwargs:
   until:
@@ -15,9 +16,5 @@ metric_list:
   - metric: accuracy
     aggregation: mean
     higher_is_better: true
-filter_list:
-  - name: "wsc_postprocessor"
-    filter:
-      - function: !function t5_utils.WSCPostprocess
 metadata:
-  version: 0.0
+  version: 1.0
lm_eval/tasks/super_glue/wsc/t5_utils.py
 import re
 
-from lm_eval.api.filter import Filter
+from typing import List
 
 
 def doc_to_text(x):
     text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
@@ -24,7 +23,7 @@ def _wsc_inputs(x):
         [
             " ".join(words[:pronoun_index]),
             "X",
-            " ".join(words[pronoun_index + 1 :]),
+            " ".join(words[pronoun_index + 1 :]),
         ]
     )
@@ -52,9 +51,7 @@ def _wsc_inputs(x):
     return create_input()
 
 
-class WSCPostprocess(Filter):
-    def __init__(self, **kwargs):
-        self.determiners = {
+DETERMINERS = {
     "a",
     "an",
     "few",
@@ -76,18 +73,18 @@ class WSCPostprocess(Filter):
     "which",
     "whose",
     "your",
-    }
+}
 
-    def clean(self, s):
+def clean(s: str) -> str:
     """Ignore capitalization and determiners."""
     s = s.strip().lower()
-    return " ".join([w for w in s.split(" ") if w not in self.determiners])
+    return " ".join([w for w in s.split(" ") if w not in DETERMINERS])
 
-    def apply(self, resps, docs):
-        filtered_resps = []
-        for prediction, reference in zip(*(resps, docs["span1_text"])):
-            prediction = self.clean(prediction[0])
-            reference = self.clean(reference)
+def process_results(docs: dict, resps: List):
+    prediction = clean(resps[0])
+    reference = clean(docs["span1_text"])
     if ("'" in prediction) != ("'" in reference):
         # referent is "Bob's hat" as predicting the referent.
@@ -102,6 +99,5 @@ class WSCPostprocess(Filter):
         referent_words
     ) or referent_words.issubset(prediction_words)
-    filtered_resps.append(predicted_referent)
-
-    return filtered_resps
+    acc = 1.0 if predicted_referent == docs["label"] else 0.0
+    return {"accuracy": acc}
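Annotation (not part of the commit): the companion t5-prompt.yaml change points process_results at this new function, so each generation is scored directly instead of going through a registered Filter. A hypothetical call, with illustrative field values and an expected-shape result:

    # Hypothetical doc/response pair, for illustration only.
    doc = {"span1_text": "The city councilmen", "label": 1}
    resps = ["the city councilmen"]

    # Cleans both strings (lowercase, drop determiners), checks whether the
    # generation names the referent, and returns a dict for the accuracy metric.
    print(process_results(doc, resps))  # e.g. {"accuracy": 1.0}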
lm_eval/tasks/xwinograd/utils.py
@@ -51,7 +51,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
     for lang in LANGUAGES:
         file_name = f"xwinograd_{lang}.yaml"
         try:
-            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
+            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
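Annotation (not part of the commit): this commit adds an explicit encoding to most open() calls so file I/O no longer depends on the platform's locale default. A minimal illustration of the pattern; the file name is hypothetical:

    # Without encoding="utf-8", open() uses the locale default (e.g. cp1252 on
    # some Windows setups), which can fail on non-ASCII prompts or task names.
    with open("xwinograd_en.yaml", "w", encoding="utf-8") as f:
        f.write("# Generated by utils.py\n")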
lm_eval/utils.py
@@ -472,6 +472,10 @@ def get_git_commit_hash():
     return git_hash
 
 
+def ignore_constructor(loader, node):
+    return node
+
+
 def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
@@ -489,11 +493,14 @@ def import_function(loader, node):
     return function
 
 
-# Add the import_function constructor to the YAML loader
-yaml.add_constructor("!function", import_function)
-
-
-def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
+def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
+    if mode == "simple":
+        constructor_fn = ignore_constructor
+    elif mode == "full":
+        constructor_fn = import_function
+
+    # Add the import_function constructor to the YAML loader
+    yaml.add_constructor("!function", constructor_fn)
+
     if yaml_config is None:
         with open(yaml_path, "rb") as file:
             yaml_config = yaml.full_load(file)
@@ -521,7 +528,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
                 path = os.path.join(yaml_dir, path)
             try:
-                included_yaml_config = load_yaml_config(path)
+                included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
                 final_yaml_config.update(included_yaml_config)
             except Exception as ex:
                 # If failed to load, ignore
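Annotation (not part of the commit): with the new mode argument, callers choose whether !function tags are resolved into Python callables or left as raw YAML nodes. A hypothetical sketch; the path is illustrative:

    from lm_eval.utils import load_yaml_config

    # "full" (the default) resolves tags like `!function task.Qasper` to callables.
    cfg = load_yaml_config(yaml_path="lm_eval/tasks/scrolls/scrolls.yaml", mode="full")

    # "simple" applies ignore_constructor, leaving !function nodes unresolved.
    raw = load_yaml_config(yaml_path="lm_eval/tasks/scrolls/scrolls.yaml", mode="simple")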
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 [project.optional-dependencies]
 anthropic = ["anthropic"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
-gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
+gptq = ["auto-gptq[triton]>=0.6.0"]
 ifeval = ["langdetect", "immutabledict"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 openai = ["openai==1.3.9", "tiktoken"]
-promptsource = ["promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"]
+optimum = ["optimum[openvino]"]
+promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
scripts/build_benchmark.py
@@ -23,7 +23,7 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
 
-    with open(args.benchmark_path) as file:
+    with open(args.benchmark_path, encoding="utf-8") as file:
         TASK_LIST = yaml.full_load(file)
     for task in tqdm(TASK_LIST):
         eval_logger.info(f"Processing {task}")
@@ -57,5 +57,5 @@ if __name__ == "__main__":
         file_save_path = os.path.join(file_path, full_file_name)
         eval_logger.info(f"Save to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(config_dict, yaml_file)
scripts/clean_training_data/generate_13_grams.py
@@ -119,7 +119,7 @@ class Buckets:
 def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
-    pile_statistics = json.load(open("pile_statistics.json", "r"))
+    pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
     pile_document_count = pile_statistics["Document Count"]
     start_offsets = pile_statistics["File Start Offsets"]
@@ -212,4 +212,4 @@ if __name__ == "__main__":
     info_dict = {"title": "dataset ngrams", "ngram_size": 13}
     info_dict_path = os.path.join(args.working_directory, "info.json")
-    json.dump(info_dict, open(info_dict_path, "w"))
+    json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
scripts/clean_training_data/investigate_pile.py
@@ -79,7 +79,7 @@ if __name__ == "__main__":
     stats_file_path = "pile_statistics.json"
     if os.path.exists(stats_file_path):
-        stats = json.load(open(stats_file_path, "r"))
+        stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
     else:
         document_count, total_document_size_chars, start_offsets = get_stats()
         stats = {
@@ -88,7 +88,7 @@ if __name__ == "__main__":
             "Total Pile Characters": total_document_size_chars,
             "File Start Offsets": start_offsets,
         }
-        json.dump(stats, open(stats_file_path, "w"), indent=4)
+        json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)
     print(f"document_count: {stats['Document Count']}")
     print(f"total_chars: {stats['Total Pile Characters']}")
scripts/make_table_results.py
@@ -61,14 +61,14 @@ if __name__ == "__main__":
         if not filenames:
             continue
         path_readme = os.path.join(dirpath, "README.md")
-        with open(path_readme, "w") as f:
+        with open(path_readme, "w", encoding="utf-8") as f:
             # get path name, only last folder
             path_name = dirpath.split("/")[-1]
             f.write(f"# {path_name}\n\n")
         for filename in sorted([f for f in filenames if f.endswith(".json")]):
             path = os.path.join(dirpath, filename)
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 result_dict = json.load(f)
-            with open(path_readme, "a") as f:
+            with open(path_readme, "a", encoding="utf-8") as f:
                 f.write(f"## {filename}\n")
                 f.write(f"{make_table(result_dict)}\n")
scripts/make_table_tasks.py
@@ -11,14 +11,13 @@ import datasets
 import pandas as pd
 from lm_eval import tasks
-from lm_eval.tasks import TASK_REGISTRY
 from lm_eval.utils import load_yaml_config
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 datasets.disable_caching()
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager
 
 
 def load_changed_files(file_path: str) -> List[str]:
@@ -74,11 +73,11 @@ def maketable(df):
     ]
     values = []
     if not df:
-        _tasks = tasks.TASK_REGISTRY.items()
+        _tasks = task_manager.TASK_REGISTRY.items()
         _tasks = sorted(_tasks, key=lambda x: x[0])
     else:
         task_classes = new_tasks()
-        _tasks = [(x, TASK_REGISTRY.get(x)) for x in task_classes]
+        _tasks = [(x, task_manager.TASK_REGISTRY.get(x)) for x in task_classes]
     count = 0
     for tname, Task in _tasks:
         task = Task()
scripts/regression.py
@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
         ret = os.system(command)
 
-        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+        results[model] = (
+            json.load(open(output_path, encoding="utf-8"))
+            if ret == 0
+            else {"results": {}}
+        )
 
     end_time = time.time()
scripts/write_out.py
@@ -5,7 +5,7 @@ import random
 import numpy as np
 
 from lm_eval import tasks
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import eval_logger, join_iters
@@ -39,22 +39,21 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)
 
-    initialize_tasks(args.verbosity)
-
-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
 
     if args.tasks == "all_tasks":
-        task_names = tasks.ALL_TASKS
+        task_names = task_manager.all_tasks
     else:
         task_names = args.tasks.split(",")
-    task_dict = tasks.get_task_dict(task_names)
+    task_dict = tasks.get_task_dict(task_names, task_manager)
 
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if type(task) == tuple:
-            group_name, task = task
+        if isinstance(task, tuple):
+            _, task = task
         rnd = random.Random()
         rnd.seed(args.seed)
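Annotation (not part of the commit): several scripts in this commit move from the module-level initialize_tasks()/TASK_REGISTRY globals to an explicit TaskManager object. A hypothetical minimal usage mirroring the updated script; the verbosity string and task names are illustrative:

    from lm_eval import tasks
    from lm_eval.tasks import TaskManager

    # The manager indexes available tasks (optionally from an extra include path)
    # and is passed to get_task_dict instead of relying on global registry state.
    task_manager = TaskManager("INFO", include_path=None)
    task_dict = tasks.get_task_dict(["arc_easy", "gsm8k"], task_manager)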
scripts/zeno_visualize.py
@@ -69,18 +69,20 @@ def main():
         model_args = re.sub(
             "/|=",
             "__",
-            json.load(open(Path(args.data_path, model, "results.json")))["config"]["model_args"],
+            json.load(
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+            )["config"]["model_args"],
         )
         with open(
-            Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+            Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+            "r",
+            encoding="utf-8",
         ) as file:
             data = json.loads(file.read())
-        configs = json.load(open(Path(args.data_path, model, "results.json")))["configs"]
+        configs = json.load(
+            open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+        )["configs"]
         config = configs[task]
         if model_index == 0:  # Only need to assemble data for the first model
@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
         list: A list of tasks for the model.
     """
     dir_path = Path(data_path, model)
-    config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
     return list(config[0].keys())
tests/models/test_huggingface.py
@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM
 
 
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()
 
 
 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
     version_minor = sys.version_info.minor
-    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
+    generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances
-    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
+    rolling_task = task_list["wikitext"]  # type: ignore
     rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
     ROLLING: list[Instance] = rolling_task.instances
tests/models/test_openvino.py (new file, 0 → 100644)
+import random
+import tempfile
+
+import pytest
+from optimum.intel import OVModelForCausalLM
+from transformers import AutoTokenizer
+
+import lm_eval.evaluator as evaluator
+from lm_eval.api.registry import get_model
+
+SUPPORTED_ARCHITECTURES_TASKS = {
+    "facebook/opt-125m": "lambada_openai",
+    "hf-internal-testing/tiny-random-gpt2": "wikitext",
+}
+
+
+@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
+def test_evaluator(model_id, task):
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
+        model.save_pretrained(tmpdirname)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(tmpdirname)
+
+        lm = get_model("openvino").create_from_arg_string(
+            f"pretrained={tmpdirname}",
+            {
+                "batch_size": 1,
+                "device": "cpu",
+            },
+        )
+
+        def ll_fn(reqs):
+            for ctx, cont in [req.args for req in reqs]:
+                if len(ctx) == 0:
+                    continue
+                # space convention
+                assert ctx[-1] != " "
+                assert cont[0] == " " or ctx[-1] == "\n"
+
+            res = []
+            random.seed(42)
+            for _ in reqs:
+                res.append((-random.random(), False))
+
+            return res
+
+        def ll_perp_fn(reqs):
+            for (string,) in [req.args for req in reqs]:
+                assert isinstance(string, str)
+
+            res = []
+            random.seed(42)
+            for _ in reqs:
+                res.append(-random.random())
+
+            return res
+
+        lm.loglikelihood = ll_fn
+        lm.loglikelihood_rolling = ll_perp_fn
+
+        limit = 10
+        evaluator.simple_evaluate(
+            model=lm,
+            tasks=[task],
+            num_fewshot=0,
+            limit=limit,
+            bootstrap_iters=10,
+        )