gaoqiong / lm-evaluation-harness · Commits

Commit 75137836, authored Nov 09, 2023 by lintangsutawika

add scrolls

parent a3520619

Showing 1 changed file with 490 additions and 0 deletions

lm_eval/tasks/scrolls.py  0 → 100644  (+490, −0)
"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533
SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.
Homepage: https://www.scrolls-benchmark.com/
Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
```
class QasperGPTNeoX4K(Qasper):
PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
PRUNE_MAX_TOKENS = 4096
PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
```
`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing
models that use different tokenizers but the same maximum sequence length.
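For example, a subset pruned against two tokenizers at once might look like this
(class name hypothetical):
```
class QasperMultiTokenizer4K(Qasper):
    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped", "gpt2"]
    PRUNE_MAX_TOKENS = 4096
```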

Once the subset task class has been defined in this file, it can be registered with
the `register_task` decorator (as is done for the tasks below) and exposed via
`lm_eval/tasks/__init__.py`.
NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""

import re

import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics

from abc import abstractmethod
from datasets import load_metric
from functools import reduce
from transformers import AutoTokenizer

from lm_eval.api.instance import Instance
from lm_eval.api.metrics import mean
from lm_eval.api.registry import register_task
from lm_eval.api.task import Task

_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
    title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
    author = "Shaham, Uri  and
      Segal, Elad  and
      Ivgi, Maor  and
      Efrat, Avia  and
      Yoran, Ori  and
      Haviv, Adi  and
      Gupta, Ankit  and
      Xiong, Wenhan  and
      Geva, Mor  and
      Berant, Jonathan  and
      Levy, Omer",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.823",
    pages = "12007--12021"
}
"""


# SCROLLS is formulated as a sequence-to-sequence task.
# To allow for evaluation of causal models, we'll
# reformulate these with appropriate prompts.
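# For the QA-style tasks, for instance, the reformulated prompt has the shape
# (mirroring `doc_to_text` below):
#   "{text}\n\nQuestion: {question}\nAnswer:"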
def _download_metric():
    import os
    import shutil

    from huggingface_hub import hf_hub_download

    scrolls_metric_path = hf_hub_download(
        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
    )
    # copy the metric script to a filename with "." replaced by "_"
    # (e.g. "scrolls.py" -> "scrolls_py.py") before handing it to `load_metric`
    updated_scrolls_metric_path = os.path.join(
        os.path.dirname(scrolls_metric_path),
        os.path.basename(scrolls_metric_path).replace(".", "_") + ".py",
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path


def _process_doc_prepended_question(doc):
    # "When a query is given in addition to the raw text (as
    # in QMSum, Qasper, NarrativeQA, QuALITY, and ContractNLI),
    # we prepend it to the text, using two newlines as a natural separator"
    # (an example follows this function)
    input = doc["input"]
    split = input.find("\n\n")
    return {
        "id": doc["id"],
        "pid": doc["pid"],
        "input": input,
        "outputs": doc["outputs"],
        "question": input[0:split],
        "text": input[split + 2 :],
    }
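

# e.g. for doc["input"] == "What is X?\n\nSome long document ...", the processed
# doc has question == "What is X?" and text == "Some long document ..."
# (example input hypothetical).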


def _drop_duplicates_in_input(untokenized_dataset):
    # from scrolls/evaluator/dataset_evaluator.py
    # Collapse rows that share the same "id" into one row, gathering all of their
    # "output" values into a new list column "outputs" (an example follows this
    # function).
    indices_to_keep = []
    id_to_idx = {}
    outputs = []
    for i, (id_, output) in enumerate(
        zip(untokenized_dataset["id"], untokenized_dataset["output"])
    ):
        if id_ in id_to_idx:
            outputs[id_to_idx[id_]].append(output)
            continue
        indices_to_keep.append(i)
        id_to_idx[id_] = len(outputs)
        outputs.append([output])
    untokenized_dataset = untokenized_dataset.select(indices_to_keep).flatten_indices()
    untokenized_dataset = untokenized_dataset.remove_columns("output")
    untokenized_dataset = untokenized_dataset.add_column("outputs", outputs)
    return untokenized_dataset
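

# e.g. rows {"id": "a", "output": "x"} and {"id": "a", "output": "y"} collapse
# into a single row with outputs == ["x", "y"] (example rows hypothetical).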


def _num_cpu_cores():
    # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
    try:
        import psutil

        return psutil.cpu_count(logical=False)
    except ImportError:
        import os

        return len(os.sched_getaffinity(0))


class _SCROLLSTask(Task):
    VERSION = 0
    DATASET_PATH = "tau/scrolls"
    DATASET_NAME = None
    PRUNE_TOKENIZERS = None
    PRUNE_MAX_TOKENS = None
    PRUNE_NUM_PROC = None

    def __init__(self, no_metric=False):
        super().__init__()
        self.metric = (
            load_metric(_download_metric(), config_name=self.DATASET_NAME)
            if not no_metric
            else None
        )

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        for doc in self.dataset["train"]:
            yield from self._process_doc(doc)

    def validation_docs(self):
        for doc in self.dataset["validation"]:
            yield from self._process_doc(doc)

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["input"]

    def download(self, *args, **kwargs):
        super().download(*args, **kwargs)
        del self.dataset["test"]  # SCROLLS test splits have no reference outputs
        for split in self.dataset:
            self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_MAX_TOKENS is not None:
            self.prune()

    def _get_prune_text(self, sample):
        return self.doc_to_text(self._process_doc(sample)[0])

    def prune(self):
        """Create a pruned version of a SCROLLS task dataset containing only inputs
        that are less than `PRUNE_MAX_TOKENS` when tokenized by each tokenizer
        """
        tokenizers = [
            AutoTokenizer.from_pretrained(tokenizer)
            for tokenizer in self.PRUNE_TOKENIZERS
        ]
        cache = {}

        def _filter(sample):
            text = self._get_prune_text(sample)
            cached = cache.get(text, None)
            if cached is None:
                for tokenizer in tokenizers:
                    if len(tokenizer(text).input_ids) > self.PRUNE_MAX_TOKENS:
                        cache[text] = False
                        return False
                cache[text] = True
                return True
            else:
                return cached

        self.dataset = self.dataset.filter(_filter, num_proc=self.PRUNE_NUM_PROC)

    def doc_to_target(self, doc):
        return " " + ", ".join(doc["outputs"])

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"

    def higher_is_better(self):
        return {x: True for x in self._scrolls_metrics().keys()}

    @abstractmethod
    def _scrolls_metrics(self):
        pass

    def _make_compute_metrics(self, value):
        def compute_metrics(samples):
            predictions, references = zip(*samples)  # unzip, if you will
            computed = self.metric.compute(
                predictions=predictions, references=references
            )
            return computed[value]

        return compute_metrics

    def aggregation(self):
        return {
            key: self._make_compute_metrics(value)
            for key, value in self._scrolls_metrics().items()
        }


class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
    def __init__(self):
        super().__init__(no_metric=True)

    def _scrolls_metrics(self):
        return None

    def aggregation(self):
        return {"em": mean, "acc": mean, "acc_norm": mean}

    def higher_is_better(self):
        return {"em": True, "acc": True, "acc_norm": True}

    def process_results(self, doc, results):
        gold = doc["gold"]

        acc = 1.0 if np.argmax(results) == gold else 0.0
        # normalize the log-likelihoods by each choice's character length
        # before taking the argmax
        completion_len = np.array([float(len(i)) for i in doc["choices"]])
        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0

        return {
            "acc": acc,
            "acc_norm": acc_norm,
            "em": acc_norm * 100.0,
        }

    def construct_requests(self, doc, ctx):
        request_list = [
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " {}".format(choice)),
                idx=i,
            )
            for i, choice in enumerate(doc["choices"])
        ]
        return request_list


class _SCROLLSSummaryTask(_SCROLLSTask):
    def _process_doc(self, doc):
        return [doc]

    def _scrolls_metrics(self):
        return {
            "rouge1": "rouge/rouge1",
            "rouge2": "rouge/rouge2",
            "rougeL": "rouge/rougeL",
        }

    def process_results(self, doc, results):
        return {
            "rouge1": (results[0], doc["outputs"]),
            "rouge2": (results[0], doc["outputs"]),
            "rougeL": (results[0], doc["outputs"]),
        }

    def construct_requests(self, doc, ctx):
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
        )

    def doc_to_text(self, doc):
        return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"


@register_task("scrolls_qasper")
class Qasper(_SCROLLSTask):
    """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
    https://arxiv.org/abs/2105.03011
    """

    DATASET_NAME = "qasper"

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        # a question counts as yes/no only if every reference answer
        # normalizes to "yes" or "no"
        doc["is_yes_no"] = reduce(
            lambda prev, cur: prev
            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
            doc["outputs"],
            True,
        )
        return [doc]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def process_results(self, doc, results):
        if doc["is_yes_no"]:
            prediction = " yes" if results[0] > results[1] else " no"
        elif len(results[0].strip()) == 0:
            prediction = "Unanswerable"
        else:
            prediction = results[0]
        return {"f1": (prediction, doc["outputs"])}

    def construct_requests(self, doc, ctx):
        if doc["is_yes_no"]:
            return [
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " yes"),
                    idx=0,
                ),
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " no"),
                    idx=1,
                ),
            ]
        else:
            return Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
            )


@register_task("scrolls_quality")
class QuALITY(_SCROLLSMultipleChoiceTask):
    """QuALITY: Question Answering with Long Input Texts, Yes!
    https://arxiv.org/abs/2112.08608
    """

    DATASET_NAME = "quality"
    _multiple_choice_pattern = re.compile(r" *\([A-D]\) *")

    @staticmethod
    def _normalize_answer(text):
        return " ".join(text.split()).strip()

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)

        split = doc["text"].find("\n\n", doc["text"].find("(D)"))
        choices_text = doc["text"][:split]

        doc["text"] = doc["text"][split:].strip()
        doc["choices"] = [
            QuALITY._normalize_answer(choice)
            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
        ]
        doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))

        return [doc]


@register_task("scrolls_narrativeqa")
class NarrativeQA(_SCROLLSTask):
    """The NarrativeQA Reading Comprehension Challenge
    https://arxiv.org/abs/1712.07040
    """

    DATASET_NAME = "narrative_qa"

    def _process_doc(self, doc):
        return [_process_doc_prepended_question(doc)]

    def _scrolls_metrics(self):
        return {"f1": "f1"}

    def _get_prune_text(self, doc):
        # pruning narrativeqa takes forever -- let's cheat a bit
        # and just cache on the text, not the question, since
        # the dataset is different questions about the same large
        # documents
        return self._process_doc(doc)[0]["text"]

    def process_results(self, doc, results):
        return {"f1": (results[0], doc["outputs"])}

    def construct_requests(self, doc, ctx):
        return Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"]}),
            idx=0,
        )


@register_task("scrolls_contractnli")
class ContractNLI(_SCROLLSMultipleChoiceTask):
    """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
    https://arxiv.org/abs/2110.01799
    """

    DATASET_NAME = "contract_nli"
    CHOICES = ["Not mentioned", "Entailment", "Contradiction"]

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
        doc["choices"] = ContractNLI.CHOICES
        doc["gold"] = ContractNLI.CHOICES.index(doc["outputs"][0])
        return [doc]

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"


@register_task("scrolls_govreport")
class GovReport(_SCROLLSSummaryTask):
    """Efficient Attentions for Long Document Summarization
    https://arxiv.org/abs/2104.02112

    Note: The average length of the reference summaries is ~3,000
    characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models,
    it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024)
    to allow a full summary to be generated.
    """

    DATASET_NAME = "gov_report"


@register_task("scrolls_summscreenfd")
class SummScreenFD(_SCROLLSSummaryTask):
    """SummScreen: A Dataset for Abstractive Screenplay Summarization
    https://arxiv.org/abs/2104.07091
    """

    DATASET_NAME = "summ_screen_fd"


@register_task("scrolls_qmsum")
class QMSum(_SCROLLSSummaryTask):
    """QMSum: A New Benchmark for Query-based Multi-domain
    Meeting Summarization
    https://arxiv.org/abs/2104.05938
    """

    DATASET_NAME = "qmsum"

    def _process_doc(self, doc):
        return [_process_doc_prepended_question(doc)]

    def doc_to_text(self, doc):
        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"
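

# Once registered, these tasks can be run from the harness CLI; a minimal sketch,
# assuming the refactored `lm_eval` entrypoint (model name and arguments illustrative):
#   lm_eval --model hf \
#       --model_args pretrained=EleutherAI/pythia-410m-deduped \
#       --tasks scrolls_qasper,scrolls_govreport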