gaoqiong / lm-evaluation-harness · Commits

Commit a58674ae
Authored Nov 17, 2023 by lintangsutawika

    merge conflict resolved

Parents: 10cc0a56, f76941ef
Showing 14 changed files with 843 additions and 126 deletions (+843, -126):
lm_eval/api/task.py                            +81   -32
lm_eval/evaluator.py                            +8    -8
lm_eval/tasks/__init__.py                      +11    -0
lm_eval/tasks/scrolls/README.md                +31    -0
lm_eval/tasks/scrolls/scrolls.yaml              +9    -0
lm_eval/tasks/scrolls/task.py                 +460    -0
lm_eval/tasks/squadv2/README.md                 +1    -2
lm_eval/tasks/squadv2/_template_yaml            +0    -8
lm_eval/tasks/squadv2/default.yaml              +0   -13
lm_eval/tasks/squadv2/no_ans.yaml               +0    -6
lm_eval/tasks/squadv2/task.py                 +240    -0
lm_eval/tasks/squadv2/utils.py                  +0   -51
lm_eval/tasks/squadv2/with_noans_prob.yaml      +0    -4
tests/models/test_huggingface.py                +2    -2
lm_eval/api/task.py  (+81, -32)

```diff
@@ -94,7 +94,7 @@ class TaskConfig(dict):
     metadata: str = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks

     def __post_init__(self) -> None:
-        if "." in self.dataset_path:
+        if self.dataset_path and ("." in self.dataset_path):
             import inspect
             from importlib import import_module

@@ -207,19 +207,9 @@ class Task(abc.ABC):
         self._fewshot_docs = None
         self._instances = None

-        self._config = TaskConfig(**config) if config else TaskConfig()
+        self._config = TaskConfig({**config}) if config else TaskConfig()

-        self._filters = []
-        for name, components in self._config.get(
-            "filters", [["none", [["take_first", None]]]]
-        ):
-            filter_pipeline = build_filter_ensemble(name, components)
-            self._filters.append(filter_pipeline)
-
-        self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random(1234)
-        )
+        if not hasattr(self, "_filters"):
+            self._filters = [build_filter_ensemble("none", [["take_first", None]])]

     def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
         """Downloads and returns the task dataset.

@@ -360,9 +350,7 @@ class Task(abc.ABC):
             False
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

-        eval_logger.info(
-            f"Building contexts for task on rank {rank}..."
-        )
+        eval_logger.info(f"Building contexts for task '{self.config.task}' on rank {rank}...")

         instances = []
         for doc_id, doc in utils.create_iterator(

@@ -452,7 +440,13 @@ class Task(abc.ABC):
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot):
+    def fewshot_context(
+        self,
+        doc,
+        num_fewshot,
+        rnd=random.Random(1234),
+        description=None,
+    ):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -460,34 +454,56 @@ class Task(abc.ABC):
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
+        :param rnd: random.Random
+            The pseudo-random number generator used to randomly sample examples.
+            WARNING: This is currently a required arg although it's optionalized with a default `None`.
+        :param description: str
+            The task's description that will be prepended to the fewshot examples.
         :returns: str
             The fewshot context.
         """
+        assert (
+            rnd is not None
+        ), "A `random.Random` generator argument must be provided to `rnd`"
+
+        description = description if description else ""
+
         if num_fewshot == 0:
-            # always prepend the (possibly empty) task description
-            labeled_examples = self.config.description
+            labeled_examples = ""
         else:
-            labeled_examples = self.config.description + self.sampler.get_context(
-                doc, num_fewshot
-            )
+            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
+            if self.has_training_docs():
+                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
+            else:
+                if self._fewshot_docs is None:
+                    self._fewshot_docs = list(
+                        self.validation_docs()
+                        if self.has_validation_docs()
+                        else self.test_docs()
+                    )
+
+                fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
+
+                # get rid of the doc that's the one we're evaluating, if it's in the fewshot
+                fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
+
+            labeled_examples = (
+                "\n\n".join(
+                    [
+                        self.doc_to_text(doc) + self.doc_to_target(doc)
+                        for doc in fewshotex
+                    ]
+                )
+                + "\n\n"
+            )

         example = self.doc_to_text(doc)
-        if type(example) == str:
-            return labeled_examples + example
-        elif type(example) == list:
-            return [labeled_examples + ex for ex in example]
-        elif type(example) == int:
-            if self.config.doc_to_choice is not None:
-                choices = self.doc_to_choice(doc)
-                return labeled_examples + choices[example]
-            else:
-                return labeled_examples + str(example)
+        return description + labeled_examples + example

     def apply_filters(self):
         if hasattr(self, "_filters"):
             for f in self._filters:
-                f.apply(self._instances)
+                f.apply(self._instances, None)
         else:
             eval_logger.warning("No filter defined, passing through instances")
             return self._instances

@@ -767,6 +783,39 @@ class ConfigurableTask(Task):
         )
         return super().fewshot_docs()

+    @utils.positional_deprecated
+    def fewshot_context(self, doc, num_fewshot):
+        """Returns a fewshot context string that is made up of a prepended description
+        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
+
+        :param doc: str
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param num_fewshot: int
+            The number of fewshot examples to provide in the returned context string.
+        :returns: str
+            The fewshot context.
+        """
+        if num_fewshot == 0:
+            # always prepend the (possibly empty) task description
+            labeled_examples = self.config.description
+        else:
+            labeled_examples = self.config.description + self.sampler.get_context(
+                doc, num_fewshot
+            )
+
+        example = self.doc_to_text(doc)
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]
+        elif type(example) == int:
+            if self.config.doc_to_choice is not None:
+                choices = self.doc_to_choice(doc)
+                return labeled_examples + choices[example]
+            else:
+                return labeled_examples + str(example)
+
     def apply_filters(self):
         if hasattr(self, "_filters"):
             for f in self._filters:
```
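For orientation, here is a minimal sketch of how the restored base-class signature is meant to be driven by the Python-defined tasks added in this commit. The task choice, description string, and zero-shot setting are illustrative, not part of the diff:

```python
import random

from lm_eval.tasks.squadv2.task import SQuAD2

task = SQuAD2()  # downloads squad_v2 via Task.download()
doc = list(task.validation_docs())[0]

# Old-style API restored on the base Task class: the caller supplies the RNG
# (required despite the default) and an optional description to prepend.
ctx = task.fewshot_context(
    doc,
    num_fewshot=0,
    rnd=random.Random(1234),
    description="Answer the question, or reply 'unanswerable'.\n\n",
)
requests = task.construct_requests(doc, ctx)  # one generate_until + one loglikelihood Instance
```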
lm_eval/evaluator.py  (+8, -8)

```diff
@@ -225,6 +225,7 @@ def evaluate(
             versions[group_name] = "N/A"

         else:
+            group_name = None
             task_hierarchy[task_name] = []

         if task is None:

@@ -236,8 +237,10 @@ def evaluate(
             if "task_alias" in configs[task_name]:
                 task_group_alias[task_name] = configs[task_name]["task_alias"]

-            if ("group_alias" in configs[task_name]) and (
-                group_name not in task_group_alias
+            if (
+                ("group_alias" in configs[task_name])
+                and (group_name not in task_group_alias)
+                and (group_name is not None)
             ):
                 task_group_alias[group_name] = configs[task_name]["group_alias"]

@@ -267,12 +270,9 @@ def evaluate(
                 eval_logger.info(f"Request: {str(inst)}")

         # aggregate Instances by LM method requested to get output.
-        reqtype = (
-            "loglikelihood"
-            if task.OUTPUT_TYPE == "multiple_choice"
-            else task.OUTPUT_TYPE
-        )  # TODO: this is hacky, fix in task.py
-        requests[reqtype].extend(task.instances)
+        for instance in task.instances:
+            reqtype = instance.request_type
+            requests[reqtype].append(instance)

         if lm.world_size > 1:
             instances_rnk = torch.tensor(len(task._instances), device=lm.device)
```
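This evaluator change matters for the Python tasks added in this commit: a single task can now emit a mix of request types (Qasper, for example, builds loglikelihood requests for yes/no questions and generate_until requests otherwise), so requests are bucketed by each instance's own request_type rather than by the task's OUTPUT_TYPE. A minimal sketch of the new grouping, with an invented list standing in for task.instances:

```python
from collections import defaultdict

# Invented stand-ins for Instance objects: (request_type, payload).
instances = [
    ("loglikelihood", "qasper doc 0, ' yes'"),
    ("loglikelihood", "qasper doc 0, ' no'"),
    ("generate_until", "qasper doc 1"),
]

requests = defaultdict(list)
for request_type, payload in instances:
    requests[request_type].append(payload)  # grouped per instance, not per task

print(dict(requests))
# {'loglikelihood': [..., ...], 'generate_until': [...]}
```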
lm_eval/tasks/__init__.py  (+11, -0)

```diff
@@ -15,6 +15,17 @@ from lm_eval.api.registry import (

 import logging

+# import python tasks
+from .squadv2.task import SQuAD2
+from .scrolls.task import (
+    QuALITY,
+    NarrativeQA,
+    ContractNLI,
+    GovReport,
+    SummScreenFD,
+    QMSum,
+)
+
 eval_logger = utils.eval_logger
```
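Importing the new task modules here runs their @register_task decorators, so the Python-defined tasks resolve by name through the registry the same way the updated test below fetches arc_easy and gsm8k. A small sketch (instantiating a task triggers its dataset download):

```python
from lm_eval import tasks

# The decorators in squadv2/task.py and scrolls/task.py populate TASK_REGISTRY
# at import time, so the new Python tasks can be looked up like any other task.
squad_task = tasks.TASK_REGISTRY.get("squadv2")()
quality_task = tasks.TASK_REGISTRY.get("scrolls_quality")()
```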
lm_eval/tasks/scrolls/README.md  (new file, +31)

"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533

SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.

Homepage: https://www.scrolls-benchmark.com/

Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:

```
class QasperGPTNeoX4K(Qasper):
    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
    PRUNE_MAX_TOKENS = 4096
    PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
```

`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
that use different tokenizers but the same maximum sequence length.

Once the subset task class has been defined in this file, it can be used by adding the class
to `lm_eval/tasks/__init__.py`.

NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""
lm_eval/tasks/scrolls/scrolls.yaml  (new file, +9)

```yaml
group: scrolls
task:
  - scrolls_qasper
  - scrolls_quality
  - scrolls_narrativeqa
  - scrolls_contractnli
  - scrolls_govreport
  - scrolls_summscreenfd
  - scrolls_qmsum
```
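A minimal sketch of exercising one of the grouped subtasks end to end through the harness's Python entry point. The model choice, the small limit, and the assumption that the newly registered Python tasks resolve by name through simple_evaluate are all illustrative, not guaranteed by this diff:

```python
from lm_eval.evaluator import simple_evaluate

# Hypothetical smoke test: a small model on one SCROLLS subtask, a few docs only.
results = simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["scrolls_qasper"],
    num_fewshot=0,
    limit=8,
)
print(results["results"]["scrolls_qasper"])
```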
lm_eval/tasks/scrolls/task.py  (new file, +460)

```python
import re
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics

from abc import abstractmethod
from datasets import load_metric
from transformers import AutoTokenizer
from functools import reduce

from lm_eval.api.task import Task
from lm_eval.api.metrics import mean
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task

_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
    title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
    author = "Shaham, Uri  and
      Segal, Elad  and
      Ivgi, Maor  and
      Efrat, Avia  and
      Yoran, Ori  and
      Haviv, Adi  and
      Gupta, Ankit  and
      Xiong, Wenhan  and
      Geva, Mor  and
      Berant, Jonathan  and
      Levy, Omer",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.823",
    pages = "12007--12021"
}
"""

# SCROLLS is formulated as a sequence-to-sequence task.
# To allow for evaluation of causal models, we'll
# reformulate these with appropriate prompts


def _download_metric():
    import os
    import shutil
    from huggingface_hub import hf_hub_download

    scrolls_metric_path = hf_hub_download(
        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
    )
    updated_scrolls_metric_path = (
        os.path.dirname(scrolls_metric_path)
        + os.path.basename(scrolls_metric_path).replace(".", "_")
        + ".py"
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path


def _process_doc_prepended_question(doc):
    # "When a query is given in addition to the raw text (as
    # in QMSum, Qasper, NarrativeQA, QuALITY, and ContractNLI),
    # we prepend it to the text, using two newlines as a natural separator"
    input = doc["input"]
    split = input.find("\n\n")
    return {
        "id": doc["id"],
        "pid": doc["pid"],
        "input": input,
        "outputs": doc["outputs"],
        "question": input[0:split],
        "text": input[split + 2 :],
    }


def _drop_duplicates_in_input(untokenized_dataset):
    # from scrolls/evaluator/dataset_evaluator.py
    indices_to_keep = []
    id_to_idx = {}
    outputs = []
    for i, (id_, output) in enumerate(
        zip(untokenized_dataset["id"], untokenized_dataset["output"])
    ):
        if id_ in id_to_idx:
            outputs[id_to_idx[id_]].append(output)
            continue
        indices_to_keep.append(i)
        id_to_idx[id_] = len(outputs)
        outputs.append([output])
    untokenized_dataset = untokenized_dataset.select(indices_to_keep).flatten_indices()
    untokenized_dataset = untokenized_dataset.remove_columns("output")
    untokenized_dataset = untokenized_dataset.add_column("outputs", outputs)
    return untokenized_dataset


def _num_cpu_cores():
    # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
    try:
        import psutil

        return psutil.cpu_count(logical=False)
    except ImportError:
        import os

        return len(os.sched_getaffinity(0))


class _SCROLLSTask(Task):
    VERSION = 0
    DATASET_PATH = "tau/scrolls"
    DATASET_NAME = None
    PRUNE_TOKENIZERS = None
    PRUNE_MAX_TOKENS = None
    PRUNE_NUM_PROC = None

    def __post_init__(self):
        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        for doc in self.dataset["train"]:
            yield from self._process_doc(doc)

    def validation_docs(self):
        for doc in self.dataset["validation"]:
            yield from self._process_doc(doc)

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["input"]

    def download(self, *args, **kwargs):
        super().download(*args, **kwargs)
        del self.dataset["test"]
        for split in self.dataset:
            self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_TOKENIZERS is not None:
            self.prune()

    def _get_prune_text(self, sample):
        return self.doc_to_text(self._process_doc(sample)[0])

    def prune(self):
        """Create a pruned version of a SCROLLS task dataset containing only inputs
        that are less than `max_tokens` when tokenized by each tokenizer
        """
        tokenizers = [
            AutoTokenizer.from_pretrained(tokenizer)
            for tokenizer in self.PRUNE_TOKENIZERS
        ]
        cache = {}

        def _filter(sample):
            text = self._get_prune_text(sample)
            cached = cache.get(text, None)
            if cached is None:
                for tokenizer in tokenizers:
                    if len(tokenizer(text).input_ids) > self.PRUNE_MAX_TOKENS:
                        cache[text] = False
                        return False
                cache[text] = True
                return True
            else:
                return cached

        self.dataset = self.dataset.filter(_filter, num_proc=self.PRUNE_NUM_PROC)

    def doc_to_target(self, doc):
        return " " + ", ".join(doc["outputs"])

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"

    def higher_is_better(self):
        return {x: True for x in self._scrolls_metrics().keys()}

    @abstractmethod
    def _scrolls_metrics(self):
        pass

    def _make_compute_metrics(self, value):
        def compute_metrics(samples):
            predictions, references = zip(*samples)  # unzip, if you will
            computed = self.metric.compute(
                predictions=predictions, references=references
            )
            return computed[value]

        return compute_metrics

    def aggregation(self):
        return {
            key: self._make_compute_metrics(value)
            for key, value in self._scrolls_metrics().items()
        }


class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
    def __post_init__(self):
        self.metric = None

    def _scrolls_metrics(self):
        return None

    def aggregation(self):
        return {"em": mean, "acc": mean, "acc_norm": mean}

    def higher_is_better(self):
        return {"em": True, "acc": True, "acc_norm": True}

    def process_results(self, doc, results):
        gold = doc["gold"]

        acc = 1.0 if np.argmax(results) == gold else 0.0
        completion_len = np.array([float(len(i)) for i in doc["choices"]])
        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0

        return {
            "acc": acc,
            "acc_norm": acc_norm,
            "em": acc_norm * 100.0,
        }

    def construct_requests(self, doc, ctx, **kwargs):
        request_list = [
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " {}".format(choice)),
                idx=i,
                **kwargs,
            )
            for i, choice in enumerate(doc["choices"])
        ]
        return request_list


class _SCROLLSSummaryTask(_SCROLLSTask):
    def _process_doc(self, doc):
        return [doc]

    def _scrolls_metrics(self):
        return {
            "rouge1": "rouge/rouge1",
            "rouge2": "rouge/rouge2",
            "rougeL": "rouge/rougeL",
        }

    def process_results(self, doc, results):
        return {
            "rouge1": (results[0], doc["outputs"]),
            "rouge2": (results[0], doc["outputs"]),
            "rougeL": (results[0], doc["outputs"]),
        }

    def construct_requests(self, doc, ctx, **kwargs):
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
            **kwargs,
        )

    def doc_to_text(self, doc):
        return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"


@register_task("scrolls_qasper")
class Qasper(_SCROLLSTask):
    """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
    https://arxiv.org/abs/2105.03011
    """

    DATASET_NAME = "qasper"

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        doc["is_yes_no"] = reduce(
            lambda prev, cur: prev
            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
            doc["outputs"],
            True,
        )
        return [doc]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def process_results(self, doc, results):
        if doc["is_yes_no"]:
            prediction = " yes" if results[0] > results[1] else " no"
        elif len(results[0].strip()) == 0:
            prediction = "Unanswerable"
        else:
            prediction = results[0]
        return {"f1": (prediction, doc["outputs"])}

    def construct_requests(self, doc, ctx, **kwargs):
        if doc["is_yes_no"]:
            return [
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " yes"),
                    idx=0,
                    **kwargs,
                ),
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " no"),
                    idx=1,
                    **kwargs,
                ),
            ]
        else:
            return Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs,
            )


@register_task("scrolls_quality")
class QuALITY(_SCROLLSMultipleChoiceTask):
    """QuALITY: Question Answering with Long Input Texts, Yes!
    https://arxiv.org/abs/2112.08608
    """

    DATASET_NAME = "quality"
    _multiple_choice_pattern = re.compile(r" *\([A-D]\) *")

    @staticmethod
    def _normalize_answer(text):
        return " ".join(text.split()).strip()

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)

        split = doc["text"].find("\n\n", doc["text"].find("(D)"))
        choices_text = doc["text"][:split]

        doc["text"] = doc["text"][split:].strip()
        doc["choices"] = [
            QuALITY._normalize_answer(choice)
            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
        ]
        doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))

        return [doc]


@register_task("scrolls_narrativeqa")
class NarrativeQA(_SCROLLSTask):
    """The NarrativeQA Reading Comprehension Challenge
    https://arxiv.org/abs/1712.07040
    """

    DATASET_NAME = "narrative_qa"

    def _process_doc(self, doc):
        return [_process_doc_prepended_question(doc)]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def _get_prune_text(self, doc):
        # pruning narrativeqa takes forever -- let's cheat a bit
        # and just cache on the text, not the question, since
        # the dataset is different questions about the same large
        # documents
        return self._process_doc(doc)[0]["text"]

    def process_results(self, doc, results):
        return {"f1": (results[0], doc["outputs"])}

    def construct_requests(self, doc, ctx, **kwargs):
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
            **kwargs,
        )


@register_task("scrolls_contractnli")
class ContractNLI(_SCROLLSMultipleChoiceTask):
    """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
    https://arxiv.org/abs/1712.07040
    """

    DATASET_NAME = "contract_nli"
    CHOICES = ["Not mentioned", "Entailment", "Contradiction"]

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        doc["choices"] = ContractNLI.CHOICES
        doc["gold"] = ContractNLI.CHOICES.index(doc["outputs"][0])
        return [doc]

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"


@register_task("scrolls_govreport")
class GovReport(_SCROLLSSummaryTask):
    """Efficient Attentions for Long Document Summarization
    https://arxiv.org/abs/2104.02112

    Note: The average length of the reference summaries is ~3,000
    characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models,
    it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024)
    to allow a full summary to be generated.
    """

    DATASET_NAME = "gov_report"


@register_task("scrolls_summscreenfd")
class SummScreenFD(_SCROLLSSummaryTask):
    """SummScreen: A Dataset for Abstractive Screenplay Summarization
    https://arxiv.org/abs/2104.07091
    """

    DATASET_NAME = "summ_screen_fd"


@register_task("scrolls_qmsum")
class QMSum(_SCROLLSSummaryTask):
    """QMSum: A New Benchmark for Query-based Multi-domain
    Meeting Summarization
    https://arxiv.org/abs/2104.05938
    """

    DATASET_NAME = "qmsum"

    def _process_doc(self, doc):
        return [_process_doc_prepended_question(doc)]

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"
```
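To make the input convention handled by _process_doc_prepended_question concrete (the sample record below is invented for illustration), a SCROLLS record stores "<query>\n\n<document>" in its input field and the helper splits it back apart:

```python
sample = {
    "id": "example-1",
    "pid": "example-1_0",
    "input": "What does the contract say about termination?\n\n"
    "This Agreement may be terminated by either party...",
    "outputs": ["Entailment"],
}

processed = _process_doc_prepended_question(sample)
print(processed["question"])  # "What does the contract say about termination?"
print(processed["text"])      # "This Agreement may be terminated by either party..."
```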
lm_eval/tasks/squadv2/README.md  (+1, -2)

```diff
@@ -34,12 +34,11 @@ Homepage: https://rajpurkar.github.io/SQuAD-explorer/

 #### Groups

-* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
+* Not part of a group yet

 #### Tasks

 * `squadv2`: `Default squadv2 task`
-* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of model predicting there is no answer`

 ### Checklist
```
lm_eval/tasks/squadv2/_template_yaml  (deleted, -8)

```yaml
dataset_path: squad_v2
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
```
lm_eval/tasks/squadv2/default.yaml  (deleted, -13)

```yaml
include: _template_yaml
task: squadv2
output_type: generate_until
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
```
lm_eval/tasks/squadv2/no_ans.yaml  (deleted, -6)

```yaml
include: _template_yaml
task: squadv2_noans_loglikelihood
output_type: loglikelihood
doc_to_target: " unanswerable"
metric_list:
  - metric: perplexity
```
lm_eval/tasks/squadv2/task.py  (new file, +240)

```python
"""
Know What You Don't Know: Unanswerable Questions for SQuAD
https://arxiv.org/pdf/1806.03822.pdf

Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span, from the
corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering.

Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import datasets

from evaluate import load
from math import exp
from functools import partial
from packaging import version

from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task

_CITATION = """
@misc{rajpurkar2018know,
    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
    year={2018},
    eprint={1806.03822},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
"""


def _squad_metric(predictions, references):
    # squad_metric = load("squad_v2")
    squad_metric = datasets.load_metric("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


def _squad_agg(key, items):
    predictions, references = zip(*items)

    return _squad_metric(predictions=predictions, references=references).get(key, 0)


@register_task("squadv2")
class SQuAD2(Task):
    VERSION = 1
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(
        "1.11.0"
    ), "datasets v1.11.0 or later required for SQuAD"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return (
            "Title: "
            + doc["title"]
            + "\n\n"
            + "Background: "
            + doc["context"]
            + "\n\n"
            + "Question: "
            + doc["question"]
            + "\n\n"
            + "Answer:"
        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        answer_list = doc["answers"]["text"]
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = "unanswerable"
        return " " + answer

    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs
            ),
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " " + "unanswerable"),
                idx=0,
                **kwargs
            ),
        ]

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        continuation, (logprob_unanswerable, _) = results

        no_answer_probability = exp(logprob_unanswerable)

        predictions = {
            "id": doc["id"],
            "prediction_text": continuation,
            "no_answer_probability": no_answer_probability,
        }

        references = {
            "id": doc["id"],
            "answers": doc["answers"],
        }

        return {
            "exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": (
                predictions,
                references,
            ),  # Best exact match (with varying threshold)
            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "exact": partial(
                _squad_agg, "exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": partial(
                _squad_agg, "f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": partial(
                _squad_agg, "HasAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": partial(
                _squad_agg, "HasAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": partial(
                _squad_agg, "NoAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": partial(
                _squad_agg, "NoAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": partial(
                _squad_agg, "best_exact"
            ),  # Best exact match (with varying threshold)
            "best_f1": partial(
                _squad_agg, "best_f1"
            ),  # Best F1 (with varying threshold)
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "f1": True,  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "best_exact": True,  # Best exact match (with varying threshold)
            "best_f1": True,  # Best F1 (with varying threshold)
        }
```
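For orientation, a small sketch of the shapes flowing through process_results above (the id, text, and log-probability are invented): each document yields a generated continuation plus the loglikelihood of the literal " unanswerable" string, and the per-document (predictions, references) pairs are what _squad_agg later unzips and hands to the HF squad_v2 metric:

```python
from math import exp

# Invented per-document result, matching the two Instances built in
# construct_requests: (generated text, (logprob of " unanswerable", is_greedy)).
results = ("Denver Broncos", (-8.2, False))

continuation, (logprob_unanswerable, _) = results
prediction = {
    "id": "example-qid",  # invented id
    "prediction_text": continuation,
    "no_answer_probability": exp(logprob_unanswerable),
}
reference = {
    "id": "example-qid",
    "answers": {"text": ["Denver Broncos"], "answer_start": [177]},
}

# _squad_agg("f1", [(prediction, reference), ...]) zips these into two lists
# and reads the "f1" entry out of the squad_v2 metric's output.
```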
lm_eval/tasks/squadv2/utils.py  (deleted, -51)

```python
import re
import string
import collections


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
```
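As a quick worked example of the token-overlap F1 this deleted helper computed (the strings are invented): gold "the Denver Broncos" and prediction "Denver Broncos won" share two normalized tokens, giving precision 2/3, recall 1.0, and F1 0.8:

```python
# normalize_answer lowercases, strips punctuation and articles, then splits on whitespace.
gold_toks = ["denver", "broncos"]          # from "the Denver Broncos"
pred_toks = ["denver", "broncos", "won"]   # from "Denver Broncos won"

num_same = 2                               # tokens shared by both (Counter intersection)
precision = num_same / len(pred_toks)      # 2/3
recall = num_same / len(gold_toks)         # 1.0
f1 = 2 * precision * recall / (precision + recall)
print(f1)                                  # 0.8 (up to floating-point rounding)
```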
lm_eval/tasks/squadv2/with_noans_prob.yaml  (deleted, -4)

```yaml
group: squadv2_complete
task:
  - squadv2
  - squadv2_noans_loglikelihood
```
tests/models/test_huggingface.py  (+2, -2)

```diff
@@ -15,7 +15,7 @@ class Test_HFLM:
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
+    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances

@@ -115,7 +115,7 @@ class Test_HFLM:
     def test_logliklihood_rolling(self) -> None:
         res = self.LM.loglikelihood_rolling(self.ROLLING)
-        assert np.allclose(res, self.ROLLING_RES, atol=1e-2)
+        assert np.allclose(res, self.ROLLING_RES, atol=1e-1)

     def test_toc_encode(self) -> None:
         res = self.LM.tok_encode("foo bar")
```