gaoqiong / lm-evaluation-harness · Commits · cb8889cc

Commit cb8889cc, authored Feb 05, 2024 by lintangsutawika
Parents: ec05e561, 74119471

    merged with latest update from main

Changes: 69
Showing 20 changed files with 112 additions and 118 deletions
lm_eval/tasks/gsm8k/gsm8k-cot.yaml                                        +1   -0
lm_eval/tasks/gsm8k/gsm8k.yaml                                            +1   -0
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml                      +1   -0
lm_eval/tasks/mmlu/_generate_configs.py                                   +4   -4
lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py   +1   -1
lm_eval/tasks/model_written_evals/persona/_generate_configs.py            +1   -1
lm_eval/tasks/qasper/utils.py                                             +1   -1
lm_eval/tasks/scrolls/scrolls.yaml                                        +12  -12
lm_eval/tasks/scrolls/task.py                                             +7   -11
lm_eval/tasks/squadv2/task.py                                             +5   -2
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml                               +2   -5
lm_eval/tasks/super_glue/wsc/t5_utils.py                                  +56  -60
lm_eval/tasks/xwinograd/utils.py                                          +1   -1
lm_eval/utils.py                                                          +5   -5
pyproject.toml                                                            +4   -5
scripts/build_benchmark.py                                                +2   -2
scripts/clean_training_data/generate_13_grams.py                          +2   -2
scripts/clean_training_data/investigate_pile.py                           +2   -2
scripts/make_table_results.py                                             +3   -3
scripts/make_table_tasks.py                                               +1   -1
lm_eval/tasks/gsm8k/gsm8k-cot.yaml

@@ -41,3 +41,4 @@ filter_list:
       - function: "take_first"
 metadata:
   version: 2.0
+  num_fewshot: 8
lm_eval/tasks/gsm8k/gsm8k.yaml

@@ -24,6 +24,7 @@ generation_kwargs:
     - "\n\n"
     - "Question:"
   do_sample: false
+  temperature: 0.0
 repeats: 1
 num_fewshot: 5
 filter_list:
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml

@@ -22,3 +22,4 @@ metric_list:
 num_fewshot: 0
 metadata:
   version: 1.0
+  num_fewshot: 4
lm_eval/tasks/mmlu/_generate_configs.py

@@ -85,13 +85,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)
 
     if args.cot_prompt_path is not None:
         import json
 
-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)
 
     ALL_CATEGORIES = []
@@ -120,7 +120,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{subject}.yaml"
         eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
@@ -142,7 +142,7 @@ if __name__ == "__main__":
     file_save_path = args.save_prefix_path + ".yaml"
     eval_logger.info(f"Saving benchmark config to {file_save_path}")
-    with open(file_save_path, "w") as yaml_file:
+    with open(file_save_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(
             {
                 "group": f"mmlu_{args.task_prefix}"
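The four changes in this file only add an explicit encoding to open(). A standalone Python illustration (not harness code) of why that matters: without the argument, open() falls back to the platform's locale encoding, so generated YAML containing non-ASCII text may not round-trip identically across machines.

    # Standalone illustration (not harness code): explicit UTF-8 vs. the
    # platform-dependent default returned by locale.getpreferredencoding().
    import locale
    import os
    import tempfile

    print(locale.getpreferredencoding(False))         # varies by OS / locale settings

    path = os.path.join(tempfile.mkdtemp(), "demo.yaml")
    with open(path, "w", encoding="utf-8") as f:       # explicit, platform-independent
        f.write("description: 'évaluation de modèles'\n")
    with open(path, encoding="utf-8") as f:
        print(f.read())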
lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py

@@ -9,7 +9,7 @@ def main() -> None:
     for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
         file_name = f"{task}.yaml"
         try:
-            with open(f"{file_name}", "w") as f:
+            with open(f"{file_name}", "w", encoding="utf-8") as f:
                 f.write("# Generated by _generate_configs.py\n")
                 yaml.dump(
                     {
lm_eval/tasks/model_written_evals/persona/_generate_configs.py

@@ -9,7 +9,7 @@ def main() -> None:
     for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
         file_name = f"{task}.yaml"
         try:
-            with open(f"{file_name}", "w") as f:
+            with open(f"{file_name}", "w", encoding="utf-8") as f:
                 f.write("# Generated by _generate_configs.py\n")
                 yaml.dump(
                     {
lm_eval/tasks/qasper/utils.py

@@ -50,7 +50,7 @@ def process_docs(dataset, set_answer_type="bool"):
             obs_list["abstract"].append(abstract)
             obs_list["question"].append(question)
             obs_list["answer_type"].append(answer_type)
-            if type(answer) == list:
+            if isinstance(answer, list):
                 answer = ", ".join(answer)
             obs_list["answer"].append(answer)
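A minimal standalone illustration (not harness code) of the behavioural difference: isinstance() also accepts subclasses of list, while the old exact type() comparison did not.

    # Standalone illustration (not harness code): isinstance() vs. type() == list.
    class AnswerList(list):
        pass

    answer = AnswerList(["by fine-tuning", "by prompting"])
    print(type(answer) == list)       # False: the exact type is AnswerList
    print(isinstance(answer, list))   # True: subclasses count as lists
    print(", ".join(answer))          # "by fine-tuning, by prompting"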
lm_eval/tasks/scrolls/scrolls.yaml

 group: scrolls
 task:
-  # - task: scrolls_qasper
-  #   class: !function task.Qasper
+  - task: scrolls_qasper
+    class: !function task.Qasper
   - task: scrolls_quality
     class: !function task.QuALITY
-  # - scrolls_narrativeqa
-  #   class: !function task.NarrativeQA
+  - task: scrolls_narrativeqa
+    class: !function task.NarrativeQA
-  # - scrolls_contractnli
-  #   class: !function task.ContractNLI
+  - task: scrolls_contractnli
+    class: !function task.ContractNLI
-  # - scrolls_govreport
-  #   class: !function task.GovReport
+  - task: scrolls_govreport
+    class: !function task.GovReport
-  # - scrolls_summscreenfd
-  #   class: !function task.SummScreenFD
+  - task: scrolls_summscreenfd
+    class: !function task.SummScreenFD
-  # - scrolls_qmsum
-  #   class: !function task.QMSum
+  - task: scrolls_qmsum
+    class: !function task.QMSum
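With the six previously commented-out subtasks re-enabled alongside scrolls_quality, the full suite is again addressable through the scrolls group, presumably selectable with the usual task flag (for example, something like `lm_eval --tasks scrolls` under the v0.4 CLI).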
lm_eval/tasks/scrolls/task.py

@@ -115,7 +115,9 @@ class _SCROLLSTask(Task):
     PRUNE_MAX_TOKENS = None
     PRUNE_NUM_PROC = None
 
-    def __post_init__(self):
-        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+    def __init__(self):
+        super().__init__()
+        if self.DATASET_NAME is not None:
+            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
 
     def has_training_docs(self):
@@ -224,9 +226,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
     def process_results(self, doc, results):
         gold = doc["gold"]
 
-        acc = 1.0 if np.argmax(results) == gold else 0.0
+        lls, _ = zip(*results)
+
+        acc = 1.0 if np.argmax(lls) == gold else 0.0
         completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
+        acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
 
         return {
             "acc": acc,
@@ -279,7 +282,6 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
         return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
 
 
-# @register_task("scrolls_qasper")
 class Qasper(_SCROLLSTask):
     """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
     https://arxiv.org/abs/2105.03011
@@ -337,7 +339,6 @@ class Qasper(_SCROLLSTask):
         )
 
 
-# @register_task("scrolls_quality")
 class QuALITY(_SCROLLSMultipleChoiceTask):
     """QuALITY: Question Answering with Long Input Texts, Yes!
     https://arxiv.org/abs/2112.08608
@@ -366,7 +367,6 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
         return [doc]
 
 
-# @register_task("scrolls_narrativeqa")
 class NarrativeQA(_SCROLLSTask):
     """The NarrativeQA Reading Comprehension Challenge
     https://arxiv.org/abs/1712.07040
@@ -400,7 +400,6 @@ class NarrativeQA(_SCROLLSTask):
         )
 
 
-# @register_task("scrolls_contractnli")
 class ContractNLI(_SCROLLSMultipleChoiceTask):
     """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
     https://arxiv.org/abs/1712.07040
@@ -419,7 +418,6 @@ class ContractNLI(_SCROLLSMultipleChoiceTask):
         return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"
 
 
-# @register_task("scrolls_govreport")
 class GovReport(_SCROLLSSummaryTask):
     """Efficient Attentions for Long Document Summarization
     https://arxiv.org/abs/2104.02112
@@ -433,7 +431,6 @@ class GovReport(_SCROLLSSummaryTask):
     DATASET_NAME = "gov_report"
 
 
-# @register_task("scrolls_summscreenfd")
 class SummScreenFD(_SCROLLSSummaryTask):
     """SummScreen: A Dataset for Abstractive Screenplay Summarization
     https://arxiv.org/abs/2104.07091
@@ -442,7 +439,6 @@ class SummScreenFD(_SCROLLSSummaryTask):
     DATASET_NAME = "summ_screen_fd"
 
 
-# @register_task("scrolls_qmsum")
 class QMSum(_SCROLLSSummaryTask):
     """QMSum: A New Benchmark for Query-based Multi-domain
     Meeting Summarization
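The process_results fix above matters because each element of `results` is a (loglikelihood, is_greedy) pair, so the loglikelihoods have to be unpacked before taking the argmax. A standalone Python sketch with made-up numbers (not harness code):

    # Standalone sketch with hypothetical per-choice results (not harness code).
    import numpy as np

    results = [(-4.2, False), (-1.3, True), (-7.8, False)]   # (loglikelihood, is_greedy) pairs
    choices = ["Entailment", "Contradiction", "Not mentioned"]
    gold = 1

    lls, _ = zip(*results)                                   # keep only the loglikelihoods
    acc = 1.0 if np.argmax(lls) == gold else 0.0
    completion_len = np.array([float(len(c)) for c in choices])
    acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
    print({"acc": acc, "acc_norm": acc_norm})                # {'acc': 1.0, 'acc_norm': 1.0}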
lm_eval/tasks/squadv2/task.py

@@ -19,7 +19,7 @@ from math import exp
 from functools import partial
 from packaging import version
 
-from lm_eval.api.task import Task
+from lm_eval.api.task import ConfigurableTask
 from lm_eval.api.instance import Instance
 
 
 _CITATION = """
@@ -46,11 +46,14 @@ def _squad_agg(key, items):
     return _squad_metric(predictions=predictions, references=references).get(key, 0)
 
 
-class SQuAD2(Task):
+class SQuAD2(ConfigurableTask):
     VERSION = 3
     DATASET_PATH = "squad_v2"
     DATASET_NAME = None
 
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
+
     # HF changed squad on us so we have to make sure we aren't running the old one
     assert version.parse(datasets.__version__) >= version.parse(
         "1.11.0"
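The migration forwards the class-level VERSION into the parent's config. A standalone sketch of that pattern with simplified stand-in classes (not the lm_eval API):

    # Standalone sketch (simplified stand-ins, not the lm_eval API): the subclass
    # forwards its VERSION into the parent's config dictionary at construction.
    class ConfigurableBase:
        def __init__(self, config=None):
            self.config = config or {}

    class SQuAD2Like(ConfigurableBase):
        VERSION = 3

        def __init__(self):
            super().__init__(config={"metadata": {"version": self.VERSION}})

    print(SQuAD2Like().config)   # {'metadata': {'version': 3}}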
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml

@@ -7,6 +7,7 @@ training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: !function "t5_utils.doc_to_text"
+process_results: !function "t5_utils.process_results"
 doc_to_target: label
 generation_kwargs:
   until:
@@ -15,9 +16,5 @@ metric_list:
   - metric: accuracy
     aggregation: mean
     higher_is_better: true
-filter_list:
-  - name: "wsc_postprocessor"
-    filter:
-      - function: !function t5_utils.WSCPostprocess
 metadata:
-  version: 0.0
+  version: 1.0
View file @
cb8889cc
import
re
import
re
from
lm_eval.api.filter
import
Filter
from
typing
import
List
def
doc_to_text
(
x
):
def
doc_to_text
(
x
):
text
=
re
.
sub
(
r
" X "
,
" *"
+
x
[
"span2_text"
]
+
"* "
,
_wsc_inputs
(
x
))
text
=
re
.
sub
(
r
" X "
,
" *"
+
x
[
"span2_text"
]
+
"* "
,
_wsc_inputs
(
x
))
...
@@ -24,7 +23,7 @@ def _wsc_inputs(x):
...
@@ -24,7 +23,7 @@ def _wsc_inputs(x):
[
[
" "
.
join
(
words
[:
pronoun_index
]),
" "
.
join
(
words
[:
pronoun_index
]),
"X"
,
"X"
,
" "
.
join
(
words
[
pronoun_index
+
1
:]),
" "
.
join
(
words
[
pronoun_index
+
1
:]),
]
]
)
)
...
@@ -52,9 +51,7 @@ def _wsc_inputs(x):
...
@@ -52,9 +51,7 @@ def _wsc_inputs(x):
return
create_input
()
return
create_input
()
class
WSCPostprocess
(
Filter
):
DETERMINERS
=
{
def
__init__
(
self
,
**
kwargs
):
self
.
determiners
=
{
"a"
,
"a"
,
"an"
,
"an"
,
"few"
,
"few"
,
...
@@ -76,18 +73,18 @@ class WSCPostprocess(Filter):
...
@@ -76,18 +73,18 @@ class WSCPostprocess(Filter):
"which"
,
"which"
,
"whose"
,
"whose"
,
"your"
,
"your"
,
}
}
def
clean
(
s
elf
,
s
)
:
def
clean
(
s
:
str
)
->
str
:
"""Ignore capitalization and determiners."""
"""Ignore capitalization and determiners."""
s
=
s
.
strip
().
lower
()
s
=
s
.
strip
().
lower
()
return
" "
.
join
([
w
for
w
in
s
.
split
(
" "
)
if
w
not
in
self
.
determiners
])
return
" "
.
join
([
w
for
w
in
s
.
split
(
" "
)
if
w
not
in
DETERMINERS
])
def
apply
(
self
,
resps
,
docs
):
filtered_resps
=
[]
def
process_results
(
docs
:
dict
,
resps
:
List
):
for
prediction
,
reference
in
zip
(
*
(
resps
,
docs
[
"span1_text"
])):
prediction
=
clean
(
resps
[
0
])
prediction
=
self
.
clean
(
prediction
[
0
])
reference
=
clean
(
docs
[
"span1_text"
])
reference
=
self
.
clean
(
reference
)
if
(
"'"
in
prediction
)
!=
(
"'"
in
reference
):
if
(
"'"
in
prediction
)
!=
(
"'"
in
reference
):
# referent is "Bob's hat" as predicting the referent.
# referent is "Bob's hat" as predicting the referent.
...
@@ -102,6 +99,5 @@ class WSCPostprocess(Filter):
...
@@ -102,6 +99,5 @@ class WSCPostprocess(Filter):
referent_words
referent_words
)
or
referent_words
.
issubset
(
prediction_words
)
)
or
referent_words
.
issubset
(
prediction_words
)
filtered_resps
.
append
(
predicted_referent
)
acc
=
1.0
if
predicted_referent
==
docs
[
"label"
]
else
0.0
return
{
"accuracy"
:
acc
}
return
filtered_resps
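The refactor replaces the WSCPostprocess filter class with module-level helpers that score a single document directly. A standalone sketch of the determiner-stripping comparison those helpers rely on, with an abbreviated constant (not the full set from the file):

    # Standalone sketch (abbreviated determiner set, not the full set from the file):
    # lower-case both strings and drop determiners before comparing prediction
    # and gold referent.
    DETERMINERS = {"a", "an", "few", "the", "this", "that"}

    def clean(s: str) -> str:
        """Ignore capitalization and determiners."""
        s = s.strip().lower()
        return " ".join([w for w in s.split(" ") if w not in DETERMINERS])

    print(clean("The hat"))                     # "hat"
    print(clean("a hat") == clean("The hat"))   # True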
lm_eval/tasks/xwinograd/utils.py

@@ -51,7 +51,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
     for lang in LANGUAGES:
         file_name = f"xwinograd_{lang}.yaml"
         try:
-            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
+            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
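The commit only adds the explicit encoding here; the existing "w" if overwrite else "x" expression is what honours the overwrite flag, since mode "x" creates a file but refuses to clobber an existing one. A standalone illustration (not harness code):

    # Standalone illustration (not harness code): exclusive-create mode "x"
    # raises FileExistsError instead of overwriting.
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), "xwinograd_en.yaml")
    overwrite = False
    with open(path, "w" if overwrite else "x", encoding="utf-8") as f:
        f.write("# Generated by utils.py\n")
    try:
        with open(path, "w" if overwrite else "x", encoding="utf-8") as f:
            f.write("# Generated by utils.py\n")
    except FileExistsError as err:
        print(f"refusing to overwrite: {err}")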
lm_eval/utils.py

@@ -501,14 +501,14 @@ def import_function(loader, node):
     return function
 
 
-def load_yaml_config(mode="simple", yaml_path=None, yaml_config=None, yaml_dir=None):
+def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
     if mode == "simple":
-        constuctor_fn = ignore_constructor
+        constructor_fn = ignore_constructor
     elif mode == "full":
-        constuctor_fn = import_function
+        constructor_fn = import_function
 
     # Add the import_function constructor to the YAML loader
-    yaml.add_constructor("!function", constuctor_fn)
+    yaml.add_constructor("!function", constructor_fn)
 
     if yaml_config is None:
         with open(yaml_path, "rb") as file:
             yaml_config = yaml.full_load(file)
@@ -536,7 +536,7 @@ def load_yaml_config(mode="simple", yaml_path=None, yaml_config=None, yaml_dir=None):
             path = os.path.join(yaml_dir, path)
 
             try:
-                included_yaml_config = load_yaml_config(mode=mode, yaml_path=path)
+                included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
                 final_yaml_config.update(included_yaml_config)
             except Exception as ex:
                 # If failed to load, ignore
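Besides fixing the constuctor_fn typo, this change moves `mode` to the end of the signature and switches its default to "full". A standalone sketch (not the harness code) of why callers should keep passing arguments by keyword, as the recursive include call above does: positional callers silently change meaning when a parameter moves.

    # Standalone sketch (not the harness code): positional vs. keyword calls
    # across a signature reorder.
    def load_old(mode="simple", yaml_path=None):
        return {"mode": mode, "yaml_path": yaml_path}

    def load_new(yaml_path=None, mode="full"):
        return {"mode": mode, "yaml_path": yaml_path}

    print(load_old("simple", "task.yaml"))                  # mode='simple', yaml_path='task.yaml'
    print(load_new("simple", "task.yaml"))                  # 'simple' now lands in yaml_path
    print(load_new(yaml_path="task.yaml", mode="simple"))   # keywords stay unambiguous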
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "lm_eval"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 [project.optional-dependencies]
 anthropic = ["anthropic"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
-gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
+gptq = ["auto-gptq[triton]>=0.6.0"]
 ifeval = ["langdetect", "immutabledict"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 openai = ["openai==1.3.9", "tiktoken"]
-promptsource = [
-    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-]
+optimum = ["optimum[openvino]"]
+promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
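With the gptq and promptsource extras now resolved from PyPI versions instead of Git URLs (and the new optimum extra added), these dependency groups can presumably be installed in the usual way for optional extras, for example `pip install "lm_eval[gptq,optimum]"`, or `pip install -e ".[gptq,optimum]"` from a local checkout.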
scripts/build_benchmark.py

@@ -23,7 +23,7 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
 
-    with open(args.benchmark_path) as file:
+    with open(args.benchmark_path, encoding="utf-8") as file:
         TASK_LIST = yaml.full_load(file)
     for task in tqdm(TASK_LIST):
         eval_logger.info(f"Processing {task}")
@@ -57,5 +57,5 @@ if __name__ == "__main__":
         file_save_path = os.path.join(file_path, full_file_name)
         eval_logger.info(f"Save to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(config_dict, yaml_file)
scripts/clean_training_data/generate_13_grams.py

@@ -119,7 +119,7 @@ class Buckets:
 def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
-    pile_statistics = json.load(open("pile_statistics.json", "r"))
+    pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
 
     pile_document_count = pile_statistics["Document Count"]
     start_offsets = pile_statistics["File Start Offsets"]
@@ -212,4 +212,4 @@ if __name__ == "__main__":
     info_dict = {"title": "dataset ngrams", "ngram_size": 13}
     info_dict_path = os.path.join(args.working_directory, "info.json")
-    json.dump(info_dict, open(info_dict_path, "w"))
+    json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
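The updated calls still pass a bare open() into json.load/json.dump, so the file handles are never explicitly closed. A sketch only, not part of the commit, of the equivalent context-manager form:

    # Sketch only, not part of the commit: the same read, with the handle closed.
    import json

    def load_pile_statistics(path="pile_statistics.json"):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)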
scripts/clean_training_data/investigate_pile.py

@@ -79,7 +79,7 @@ if __name__ == "__main__":
     stats_file_path = "pile_statistics.json"
     if os.path.exists(stats_file_path):
-        stats = json.load(open(stats_file_path, "r"))
+        stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
     else:
         document_count, total_document_size_chars, start_offsets = get_stats()
         stats = {
@@ -88,7 +88,7 @@ if __name__ == "__main__":
             "Total Pile Characters": total_document_size_chars,
             "File Start Offsets": start_offsets,
         }
-        json.dump(stats, open(stats_file_path, "w"), indent=4)
+        json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)
 
     print(f"document_count: {stats['Document Count']}")
     print(f"total_chars: {stats['Total Pile Characters']}")
scripts/make_table_results.py

@@ -61,14 +61,14 @@ if __name__ == "__main__":
         if not filenames:
             continue
 
         path_readme = os.path.join(dirpath, "README.md")
-        with open(path_readme, "w") as f:
+        with open(path_readme, "w", encoding="utf-8") as f:
             # get path name, only last folder
             path_name = dirpath.split("/")[-1]
             f.write(f"# {path_name}\n\n")
         for filename in sorted([f for f in filenames if f.endswith(".json")]):
             path = os.path.join(dirpath, filename)
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 result_dict = json.load(f)
-            with open(path_readme, "a") as f:
+            with open(path_readme, "a", encoding="utf-8") as f:
                 f.write(f"## {filename}\n")
                 f.write(f"{make_table(result_dict)}\n")
View file @
cb8889cc
...
@@ -50,5 +50,5 @@ if __name__ == "__main__":
...
@@ -50,5 +50,5 @@ if __name__ == "__main__":
values
.
append
(
v
)
values
.
append
(
v
)
writer
.
value_matrix
=
values
writer
.
value_matrix
=
values
table
=
writer
.
dumps
()
table
=
writer
.
dumps
()
with
open
(
args
.
output
,
"w"
)
as
f
:
with
open
(
args
.
output
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
table
)
f
.
write
(
table
)