Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
90ad5db7
Commit
90ad5db7
authored
Mar 01, 2024
by
lintangsutawika
Browse files
merged main
parents
f692caa9
b177c82c
Changes
484
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
50 additions
and
112 deletions
+50
-112
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml
...asks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml
+1
-1
lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
+4
-1
lm_eval/tasks/qasper/metrics.py
lm_eval/tasks/qasper/metrics.py
+0
-1
lm_eval/tasks/qasper/utils.py
lm_eval/tasks/qasper/utils.py
+2
-1
lm_eval/tasks/realtoxicityprompts/metric.py
lm_eval/tasks/realtoxicityprompts/metric.py
+3
-2
lm_eval/tasks/scrolls/task.py
lm_eval/tasks/scrolls/task.py
+8
-7
lm_eval/tasks/squadv2/task.py
lm_eval/tasks/squadv2/task.py
+6
-6
lm_eval/tasks/super_glue/cb/aggregate.py
lm_eval/tasks/super_glue/cb/aggregate.py
+1
-1
lm_eval/tasks/super_glue/record/t5_utils.py
lm_eval/tasks/super_glue/record/t5_utils.py
+2
-2
lm_eval/tasks/super_glue/wsc/t5_utils.py
lm_eval/tasks/super_glue/wsc/t5_utils.py
+6
-5
lm_eval/tasks/truthfulqa/utils.py
lm_eval/tasks/truthfulqa/utils.py
+1
-2
lm_eval/tasks/xwinograd/utils.py
lm_eval/tasks/xwinograd/utils.py
+3
-1
lm_eval/utils.py
lm_eval/utils.py
+3
-71
pyproject.toml
pyproject.toml
+4
-5
No files found.
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_te_mc2
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
te
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc1
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
uk
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc2
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
uk
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc1
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
vi
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc2
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
vi
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc1
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
zh
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml
View file @
90ad5db7
...
...
@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc2
dataset_path
:
alexandrainst/m_truthfulqa
dataset_name
:
zh
training_split
:
null
validation_split
:
val
idation
validation_split
:
val
test_split
:
null
lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
View file @
90ad5db7
import
datasets
import
re
import
datasets
import
numpy
as
np
QA_PROMPT
=
(
"Q: What is human life expectancy in the United States?
\n
"
"A: Human life expectancy in the United States is 78 years.
\n\n
"
...
...
@@ -17,6 +19,7 @@ QA_PROMPT = (
"A: The 1992 Olympics were held in Barcelona, Spain."
)
def
preprocess
(
text
):
if
text
is
None
:
return
" "
...
...
lm_eval/tasks/qasper/metrics.py
View file @
90ad5db7
import
re
import
string
from
collections
import
Counter
...
...
lm_eval/tasks/qasper/utils.py
View file @
90ad5db7
from
datasets
import
Dataset
from
functools
import
partial
from
datasets
import
Dataset
def
process_docs
(
dataset
,
set_answer_type
=
"bool"
):
FEATURES
=
[
"title"
,
"abstract"
,
"question"
,
"answer"
,
"answer_type"
]
...
...
lm_eval/tasks/realtoxicityprompts/metric.py
View file @
90ad5db7
import
os
import
json
import
requests
import
os
import
numpy
as
np
import
requests
from
lm_eval.utils
import
eval_logger
...
...
lm_eval/tasks/scrolls/task.py
View file @
90ad5db7
import
re
from
abc
import
abstractmethod
from
functools
import
reduce
import
numpy
as
np
import
transformers.data.metrics.squad_metrics
as
squad_metrics
from
abc
import
abstractmethod
from
datasets
import
load_metric
from
transformers
import
AutoTokenizer
from
functools
import
reduce
from
lm_eval.api.task
import
Task
from
lm_eval.api.metrics
import
mean
from
lm_eval.api.instance
import
Instance
from
lm_eval.api.registry
import
register_task
from
lm_eval.api.metrics
import
mean
from
lm_eval.api.task
import
Task
_CITATION
=
"""
@inproceedings{shaham-etal-2022-scrolls,
...
...
@@ -44,6 +44,7 @@ _CITATION = """
def
_download_metric
():
import
os
import
shutil
from
huggingface_hub
import
hf_hub_download
scrolls_metric_path
=
hf_hub_download
(
...
...
@@ -148,7 +149,7 @@ class _SCROLLSTask(Task):
del
self
.
dataset
[
"test"
]
for
split
in
self
.
dataset
:
self
.
dataset
[
split
]
=
_drop_duplicates_in_input
(
self
.
dataset
[
split
])
if
self
.
PRUNE_TOKENIZERS
is
not
None
and
self
.
PRUNE_TOKENIZERS
is
not
None
:
if
self
.
PRUNE_TOKENIZERS
is
not
None
:
self
.
prune
()
def
_get_prune_text
(
self
,
sample
):
...
...
lm_eval/tasks/squadv2/task.py
View file @
90ad5db7
...
...
@@ -13,14 +13,15 @@ also determine when no answer is supported by the paragraph and abstain from ans
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import
datasets
from
math
import
exp
from
functools
import
partial
from
math
import
exp
import
datasets
from
packaging
import
version
from
lm_eval.api.task
import
ConfigurableTask
from
lm_eval.api.instance
import
Instance
from
lm_eval.api.task
import
ConfigurableTask
_CITATION
=
"""
@misc{rajpurkar2018know,
...
...
@@ -35,7 +36,6 @@ _CITATION = """
def
_squad_metric
(
predictions
,
references
):
# squad_metric = load("squad_v2")
squad_metric
=
datasets
.
load_metric
(
"squad_v2"
)
return
squad_metric
.
compute
(
predictions
=
predictions
,
references
=
references
)
...
...
@@ -52,7 +52,7 @@ class SQuAD2(ConfigurableTask):
DATASET_NAME
=
None
def
__init__
(
self
):
super
().
__init__
(
config
=
{
'
metadata
'
:
{
'
version
'
:
self
.
VERSION
}})
super
().
__init__
(
config
=
{
"
metadata
"
:
{
"
version
"
:
self
.
VERSION
}})
# HF changed squad on us so we have to make sure we aren't running the old one
assert
version
.
parse
(
datasets
.
__version__
)
>=
version
.
parse
(
...
...
lm_eval/tasks/super_glue/cb/aggregate.py
View file @
90ad5db7
import
sklearn
import
numpy
as
np
import
sklearn
def
cb_multi_fi
(
items
):
...
...
lm_eval/tasks/super_glue/record/t5_utils.py
View file @
90ad5db7
import
collections
import
re
import
string
import
collections
import
numpy
as
np
import
numpy
as
np
from
datasets
import
Dataset
from
lm_eval.api.metrics
import
metric_max_over_ground_truths
...
...
lm_eval/tasks/super_glue/wsc/t5_utils.py
View file @
90ad5db7
import
re
from
typing
import
List
def
doc_to_text
(
x
):
text
=
re
.
sub
(
r
" X "
,
" *"
+
x
[
"span2_text"
]
+
"* "
,
_wsc_inputs
(
x
))
return
"wsc: "
+
text
...
...
@@ -23,7 +24,7 @@ def _wsc_inputs(x):
[
" "
.
join
(
words
[:
pronoun_index
]),
"X"
,
" "
.
join
(
words
[
pronoun_index
+
1
:]),
" "
.
join
(
words
[
pronoun_index
+
1
:]),
]
)
...
...
lm_eval/tasks/truthfulqa/utils.py
View file @
90ad5db7
import
datasets
import
sacrebleu
import
numpy
as
np
import
sacrebleu
from
rouge_score
import
rouge_scorer
,
scoring
...
...
lm_eval/tasks/xwinograd/utils.py
View file @
90ad5db7
...
...
@@ -51,7 +51,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for
lang
in
LANGUAGES
:
file_name
=
f
"xwinograd_
{
lang
}
.yaml"
try
:
with
open
(
f
"
{
output_dir
}
/
{
file_name
}
"
,
"w"
if
overwrite
else
"x"
,
encoding
=
"utf-8"
)
as
f
:
with
open
(
f
"
{
output_dir
}
/
{
file_name
}
"
,
"w"
if
overwrite
else
"x"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
"# Generated by utils.py
\n
"
)
yaml
.
dump
(
{
...
...
lm_eval/utils.py
View file @
90ad5db7
...
...
@@ -5,16 +5,9 @@ import importlib.util
import
inspect
import
logging
import
os
import
pathlib
import
re
import
subprocess
import
sys
from
itertools
import
islice
from
typing
import
(
Any
,
Callable
,
List
,
)
from
typing
import
Any
,
Callable
,
List
import
numpy
as
np
import
yaml
...
...
@@ -249,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
values
=
[]
for
k
,
dic
in
result_dict
[
column
].
items
():
version
=
result_dict
[
"versions"
]
[
k
]
version
=
result_dict
[
"versions"
]
.
get
(
k
,
"N/A"
)
n
=
str
(
result_dict
[
"n-shot"
][
k
])
if
"alias"
in
dic
:
...
...
@@ -297,61 +290,6 @@ def positional_deprecated(fn):
return
_wrapper
@
positional_deprecated
def
find_test_root
(
start_path
:
pathlib
.
Path
)
->
pathlib
.
Path
:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path
=
start_path
.
resolve
()
max_layers
=
3
for
_
in
range
(
max_layers
):
if
(
cur_path
/
"tests"
/
"test_version_stable.py"
).
exists
():
return
cur_path
else
:
cur_path
=
cur_path
.
parent
.
resolve
()
raise
FileNotFoundError
(
f
"Unable to find package root within
{
max_layers
}
upwards"
+
f
"of
{
start_path
}
"
)
@
positional_deprecated
def
run_task_tests
(
task_list
:
List
[
str
]):
"""
Find the package root and run the tests for the given tasks
"""
import
pytest
package_root
=
find_test_root
(
start_path
=
pathlib
.
Path
(
__file__
))
task_string
=
" or "
.
join
(
task_list
)
args
=
[
f
"
{
package_root
}
/tests/test_version_stable.py"
,
f
"--rootdir=
{
package_root
}
"
,
"-k"
,
f
"
{
task_string
}
"
,
]
sys
.
path
.
append
(
str
(
package_root
))
pytest_return_val
=
pytest
.
main
(
args
)
if
pytest_return_val
:
raise
ValueError
(
f
"Not all tests for the specified tasks (
{
task_list
}
) ran successfully! Error code:
{
pytest_return_val
}
"
)
def
get_git_commit_hash
():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try
:
git_hash
=
subprocess
.
check_output
([
"git"
,
"describe"
,
"--always"
]).
strip
()
git_hash
=
git_hash
.
decode
()
except
subprocess
.
CalledProcessError
or
FileNotFoundError
:
# FileNotFoundError occurs when git not installed on system
git_hash
=
None
return
git_hash
def
ignore_constructor
(
loader
,
node
):
return
node
...
...
@@ -433,16 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
return
rtemplate
.
render
(
**
doc
)
def
create_iterator
(
raw_iterator
,
rank
,
world_size
,
limit
=
None
):
def
create_iterator
(
raw_iterator
,
*
,
rank
=
0
,
world_size
=
1
,
limit
=
None
):
"""
Method for creating a (potentially) sliced and limited
iterator from a raw document iterator. Used for splitting data
among ranks in multigpu setting or only pulling a sample of documents
"""
return
islice
(
raw_iterator
,
rank
,
limit
,
world_size
)
# Multi-token stopping criteria
# from more_itertools
pyproject.toml
View file @
90ad5db7
...
...
@@ -36,6 +36,7 @@ dependencies = [
"tqdm-multiprocess"
,
"transformers>=4.1"
,
"zstandard"
,
"dill"
,
"word2number"
,
]
...
...
@@ -71,6 +72,7 @@ sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
testing
=
[
"pytest"
,
"pytest-cov"
,
"pytest-xdist"
]
vllm
=
["vllm<=0.2.5"]
zeno
=
[
"pandas"
,
"zeno-client"
]
wandb
=
[
"wandb>=0.16.3"
,
"pandas"
,
"numpy"
]
all
=
[
"lm_eval[anthropic]"
,
"lm_eval[dev]"
,
...
...
@@ -86,11 +88,9 @@ all = [
"lm_eval[testing]"
,
"lm_eval[vllm]"
,
"lm_eval[zeno]"
,
"lm_eval[wandb]"
,
]
[tool.ruff]
extend-exclude
=
["lm_eval/tasks/*.py"]
[tool.ruff.lint]
extend-select
=
["I"]
...
...
@@ -99,5 +99,4 @@ lines-after-imports = 2
known-first-party
=
["lm_eval"]
[tool.ruff.extend-per-file-ignores]
"__init__.py"
=
["F401","F402","F403","I"]
"lm_eval/tasks/*"
=
["E721"]
"__init__.py"
=
["F401","F402","F403"]
Prev
1
…
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment