gaoqiong / lm-evaluation-harness

Commit 51753750, authored Nov 09, 2023 by lintangsutawika

updates to scrolls

parent 097c9253
Showing 4 changed files with 62 additions and 45 deletions.
lm_eval/tasks/__init__.py           +8  -1
lm_eval/tasks/scrolls/README.md     +31 -0
lm_eval/tasks/scrolls/scrolls.yaml  +9  -0
lm_eval/tasks/scrolls/task.py       +14 -44
lm_eval/tasks/__init__.py

@@ -17,7 +17,14 @@ import logging
 # import python tasks
 from .squad import SQuAD2
-from .scrolls import QuALITY, NarrativeQA, ContractNLI, GovReport, SummScreenFD, QMSum
+from .scrolls.task import (
+    QuALITY,
+    NarrativeQA,
+    ContractNLI,
+    GovReport,
+    SummScreenFD,
+    QMSum,
+)
 
 eval_logger = logging.getLogger("lm-eval")
lm_eval/tasks/scrolls/README.md (new file, mode 100644)

"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533

SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.

Homepage: https://www.scrolls-benchmark.com/

Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:

```
class QasperGPTNeoX4K(Qasper):
    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
    PRUNE_MAX_TOKENS = 4096
    PRUNE_NUM_PROC = _num_cpu_cores()  # optional, to speed up pruning of large datasets like NarrativeQA
```

`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
that use different tokenizers but the same maximum sequence length.

Once the subset task class has been defined in this file, it can be used by adding the class
to `lm_eval/tasks/__init__.py`.

NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""
\ No newline at end of file
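As an aside for readers of this diff: the `PRUNE_*` class attributes described above amount to a tokenize-and-filter step. Below is a minimal, hypothetical sketch of that idea; it assumes Hugging Face `transformers` and `datasets`, and the helper name `prune_by_length` and the `"input"` field are invented for illustration, not taken from the harness.

```python
# Illustration only: what the PRUNE_TOKENIZERS / PRUNE_MAX_TOKENS / PRUNE_NUM_PROC
# knobs imply. The helper and field names are assumptions, not harness code.
from transformers import AutoTokenizer


def prune_by_length(dataset, tokenizer_names, max_tokens, num_proc=None):
    """Keep only samples that fit within max_tokens for ALL listed tokenizers."""
    tokenizers = [AutoTokenizer.from_pretrained(name) for name in tokenizer_names]

    def short_enough(sample):
        text = sample["input"]  # assumed field name for the long document + question
        return all(len(tok(text)["input_ids"]) <= max_tokens for tok in tokenizers)

    return dataset.filter(short_enough, num_proc=num_proc)
```

Pruning against several tokenizers at once is what makes comparisons between models with different tokenizers but the same maximum sequence length fair, as the README notes.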
lm_eval/tasks/scrolls/scrolls.yaml (new file, mode 100644)

group: scrolls
task:
  - scrolls_qasper
  - scrolls_quality
  - scrolls_narrativeqa
  - scrolls_contractnli
  - scrolls_govreport
  - scrolls_summscreenfd
  - scrolls_qmsum
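This YAML registers `scrolls` as a group name covering the seven subtasks. Assuming a build of the refactored harness where `lm_eval.evaluator.simple_evaluate` accepts a group name in `tasks` (and using a placeholder model), running the whole suite could look roughly like this:

```python
# Rough usage sketch; argument names follow the refactored harness API but may
# differ between versions, and the model below is only a placeholder.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-410m-deduped",
    tasks=["scrolls"],  # the group defined in scrolls.yaml expands to all seven subtasks
    batch_size=1,
)
print(results["results"])
```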
lm_eval/tasks/scrolls.py → lm_eval/tasks/scrolls/task.py (renamed; module docstring moved out to README.md)

-"""
-SCROLLS: Standardized CompaRison Over Long Language Sequences
-https://arxiv.org/abs/2201.03533
-
-SCROLLS is a suite of datasets that require synthesizing information over long texts.
-The benchmark includes seven natural language tasks across multiple domains,
-including summarization, question answering, and natural language inference.
-
-Homepage: https://www.scrolls-benchmark.com/
-
-Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
-it is possible to create "subset" tasks that contain only those samples whose tokenized length
-is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
-be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
-
-```
-class QasperGPTNeoX4K(Qasper):
-    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
-    PRUNE_MAX_TOKENS = 4096
-    PRUNE_NUM_PROC = _num_cpu_cores()  # optional, to speed up pruning of large datasets like NarrativeQA
-```
-
-`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
-less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
-that use different tokenizers but the same maximum sequence length.
-
-Once the subset task class has been defined in this file, it can be used by adding the class
-to `lm_eval/tasks/__init__.py`.
-
-NOTE: GovReport may need `max_gen_toks` set larger for causal models.
-"""
 import re
 
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
@@ -146,13 +115,8 @@ class _SCROLLSTask(Task):
     PRUNE_MAX_TOKENS = None
     PRUNE_NUM_PROC = None
 
-    def __init__(self, no_metric=False):
-        super().__init__()
-        self.metric = (
-            load_metric(_download_metric(), config_name=self.DATASET_NAME)
-            if not no_metric
-            else None
-        )
+    def __post_init__(self):
+        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
 
     def has_training_docs(self):
         return True
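The metric construction moves from an overridden `__init__` into a `__post_init__` hook. As a generic, illustration-only sketch of that pattern (the class names below are made up, and how the harness's `Task` base class actually invokes the hook is not shown in this diff):

```python
# Illustration only: a base class that calls __post_init__ after its own setup,
# so subclasses customize initialization without re-implementing __init__.
class BaseTask:
    def __init__(self):
        self.dataset = "shared download / setup happens here"
        self.__post_init__()

    def __post_init__(self):
        pass  # default: nothing extra


class MetricTask(BaseTask):
    def __post_init__(self):
        # runs once the shared setup above has completed
        self.metric = "metric loaded here instead of in __init__"
```

Compared with the removed `no_metric=False` flag, a subclass that needs no metric simply overrides the hook, which is exactly what `_SCROLLSMultipleChoiceTask` does in the next hunk.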
@@ -245,8 +209,8 @@ class _SCROLLSTask(Task):
 class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
-    def __init__(self):
-        super().__init__(no_metric=True)
+    def __post_init__(self):
+        self.metric = None
 
     def _scrolls_metrics(self):
         return None
@@ -270,7 +234,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
             "em": acc_norm * 100.0,
         }
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         request_list = [
             Instance(
@@ -278,6 +242,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
                 doc=doc,
                 arguments=(ctx, " {}".format(choice)),
                 idx=i,
+                **kwargs,
             )
             for i, choice in enumerate(doc["choices"])
         ]
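The recurring change in this file is that `construct_requests` now takes `**kwargs` and forwards them into every `Instance`. A self-contained, illustration-only sketch of that forwarding pattern (the `Request` stand-in and the `metadata` keyword are assumptions, not the harness's `Instance` API):

```python
# Illustration only: extra evaluator-supplied keywords pass through
# construct_requests unchanged into each request object.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Request:  # stand-in for the harness's Instance
    arguments: tuple
    idx: int
    metadata: Optional[tuple] = None


def construct_requests(ctx, choices, **kwargs):
    return [
        Request(arguments=(ctx, f" {choice}"), idx=i, **kwargs)
        for i, choice in enumerate(choices)
    ]


reqs = construct_requests("Question: ...", ["yes", "no"], metadata=("demo_task", 0, 1))
assert reqs[0].metadata == ("demo_task", 0, 1)
```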
@@ -302,12 +267,13 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
             "rougeL": (results[0], doc["outputs"]),
         }
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         return Instance(
             request_type="generate_until",
             doc=doc,
             arguments=(ctx, {"until": ["\n"]}),
             idx=0,
+            **kwargs,
         )
 
     def doc_to_text(self, doc):
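For the generation-based tasks, the request arguments are `(ctx, {"until": ["\n"]})`, i.e. generate until a newline. A small illustration-only helper showing how such stop strings are conventionally applied to raw model output (this is not the harness's own truncation code):

```python
# Illustration only: truncate generated text at the first occurrence of any stop string.
def apply_stop_sequences(generated: str, until: list) -> str:
    for stop in until:
        idx = generated.find(stop)
        if idx != -1:
            generated = generated[:idx]
    return generated


assert apply_stop_sequences("A concise summary.\nExtra text", ["\n"]) == "A concise summary."
```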
@@ -344,7 +310,7 @@ class Qasper(_SCROLLSTask):
         prediction = results[0]
         return {"f1": (prediction, doc["outputs"])}
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         if doc["is_yes_no"]:
             return [
                 Instance(
@@ -352,12 +318,14 @@ class Qasper(_SCROLLSTask):
                     doc=doc,
                     arguments=(ctx, " yes"),
                     idx=0,
+                    **kwargs,
                 ),
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
                     arguments=(ctx, " no"),
                     idx=1,
+                    **kwargs,
                 ),
             ]
         else:
@@ -366,6 +334,7 @@ class Qasper(_SCROLLSTask):
                 doc=doc,
                 arguments=(ctx, {"until": ["\n"]}),
                 idx=0,
+                **kwargs,
             )
@@ -422,12 +391,13 @@ class NarrativeQA(_SCROLLSTask):
     def process_results(self, doc, results):
         return {"f1": (results[0], doc["outputs"])}
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         return Instance(
             request_type="generate_until",
             doc=doc,
             arguments=(ctx, {"until": ["\n"]}),
             idx=0,
+            **kwargs,
         )