Project: gaoqiong / lm-evaluation-harness (Commits)

Commit c21240c0
Authored Nov 20, 2023 by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into alt_worlds

Parents: bbd6ab3a, afda6551
Changes: 31 in total; this page shows 11 changed files with 266 additions and 89 deletions (+266, -89).
lm_eval/tasks/squadv2/_template_yaml        +0    -8
lm_eval/tasks/squadv2/default.yaml          +0    -13
lm_eval/tasks/squadv2/no_ans.yaml           +0    -6
lm_eval/tasks/squadv2/task.py               +240  -0
lm_eval/tasks/squadv2/utils.py              +0    -51
lm_eval/tasks/squadv2/with_noans_prob.yaml  +0    -4
lm_eval/utils.py                            +10   -1
scripts/write_out.py                        +10   -3
tests/models/test_huggingface.py            +4    -2
tests/test_evaluator.py                     +1    -0
tests/test_tasks.py                         +1    -1
lm_eval/tasks/squadv2/_template_yaml  (deleted, 100644 → 0)
dataset_path: squad_v2
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
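For reference, the doc_to_text template above renders a SQuAD record into the same prompt that the new task.py below rebuilds in its doc_to_text method. A minimal rendering sketch with jinja2, using an illustrative (not dataset-extracted) document:

# Minimal sketch: render the _template_yaml doc_to_text prompt with jinja2.
# The sample document values below are illustrative, not taken from the dataset.
from jinja2 import Template

doc_to_text = (
    "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
)
doc = {
    "title": "Normans",
    "context": "The Normans were the people who in the 10th and 11th centuries "
    "gave their name to Normandy, a region in France.",
    "question": "In what country is Normandy located?",
}
print(Template(doc_to_text).render(**doc))
# Title: Normans
#
# Background: The Normans were the people who in the 10th and 11th centuries ...
#
# Question: In what country is Normandy located?
#
#  Answer: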
lm_eval/tasks/squadv2/default.yaml  (deleted, 100644 → 0)
include: _template_yaml
task: squadv2
output_type: generate_until
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
lm_eval/tasks/squadv2/no_ans.yaml  (deleted, 100644 → 0)
include: _template_yaml
task: squadv2_noans_loglikelihood
output_type: loglikelihood
doc_to_target: " unanswerable"
metric_list:
  - metric: perplexity
lm_eval/tasks/squadv2/task.py  (new file, 0 → 100644)
"""
Know What You Don’t Know: Unanswerable Questions for SQuAD
https://arxiv.org/pdf/1806.03822.pdf
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span, from the
corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering.
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import datasets
from evaluate import load
from math import exp
from functools import partial
from packaging import version

from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task


_CITATION = """
@misc{rajpurkar2018know,
    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
    year={2018},
    eprint={1806.03822},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
"""


def _squad_metric(predictions, references):
    # squad_metric = load("squad_v2")
    squad_metric = datasets.load_metric("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


def _squad_agg(key, items):
    predictions, references = zip(*items)
    return _squad_metric(predictions=predictions, references=references).get(key, 0)


@register_task("squadv2")
class SQuAD2(Task):
    VERSION = 1
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(
        "1.11.0"
    ), "datasets v1.11.0 or later required for SQuAD"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return (
            "Title: "
            + doc["title"]
            + "\n\n"
            + "Background: "
            + doc["context"]
            + "\n\n"
            + "Question: "
            + doc["question"]
            + "\n\n"
            + "Answer:"
        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        answer_list = doc["answers"]["text"]
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = "unanswerable"
        return " " + answer

    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs,
            ),
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " " + "unanswerable"),
                idx=0,
                **kwargs,
            ),
        ]

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        continuation, (logprob_unanswerable, _) = results

        no_answer_probability = exp(logprob_unanswerable)

        predictions = {
            "id": doc["id"],
            "prediction_text": continuation,
            "no_answer_probability": no_answer_probability,
        }

        references = {
            "id": doc["id"],
            "answers": doc["answers"],
        }

        return {
            "exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": (
                predictions,
                references,
            ),  # Best exact match (with varying threshold)
            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "exact": partial(_squad_agg, "exact"),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": partial(_squad_agg, "f1"),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": partial(_squad_agg, "HasAns_exact"),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": partial(_squad_agg, "HasAns_f1"),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": partial(_squad_agg, "NoAns_exact"),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": partial(_squad_agg, "NoAns_f1"),  # The F-score of predicted tokens versus the gold answer
            "best_exact": partial(_squad_agg, "best_exact"),  # Best exact match (with varying threshold)
            "best_f1": partial(_squad_agg, "best_f1"),  # Best F1 (with varying threshold)
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "f1": True,  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "best_exact": True,  # Best exact match (with varying threshold)
            "best_f1": True,  # Best F1 (with varying threshold)
        }
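The per-document tuples returned by process_results are later collected by _squad_agg, which hands all predictions and references to the Hugging Face squad_v2 metric. A minimal sketch of the payload shape that metric expects, with illustrative values; it uses evaluate.load, the commented-out alternative to datasets.load_metric in the code above:

# Minimal sketch (illustrative id/strings): payload shape for the squad_v2 metric
# that _squad_metric/_squad_agg feed from the predictions/references built above.
from evaluate import load

predictions = [
    {
        "id": "56ddde6b9a695914005b9628",  # hypothetical example id
        "prediction_text": "France",
        "no_answer_probability": 0.01,
    }
]
references = [
    {
        "id": "56ddde6b9a695914005b9628",
        "answers": {"text": ["France"], "answer_start": [159]},
    }
]

squad_v2_metric = load("squad_v2")
print(squad_v2_metric.compute(predictions=predictions, references=references))
# e.g. {'exact': 100.0, 'f1': 100.0, 'HasAns_exact': 100.0, ...}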
lm_eval/tasks/squadv2/utils.py  (deleted, 100644 → 0)
import re
import string
import collections


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
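These deleted helpers were the per-sample metrics bound via "!function utils.exact" and "!function utils.f1" in default.yaml above. A small worked example of the normalization they apply, with illustrative strings; before this commit the functions could be imported from lm_eval.tasks.squadv2.utils:

# Worked example (illustrative strings) of the helpers defined above.
print(normalize_answer("The Norman Dynasty!"))
# -> "norman dynasty"  (lowercased, punctuation and articles stripped)
print(exact(["the Norman dynasty"], ["Norman Dynasty"]))
# -> 1  (strings agree after normalization)
print(f1(["Norman dynasty of France"], ["Norman Dynasty"]))
# -> 0.666...  (2 shared tokens: precision 2/4, recall 2/2)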
lm_eval/tasks/squadv2/with_noans_prob.yaml  (deleted, 100644 → 0)
group: squadv2_complete
task:
  - squadv2
  - squadv2_noans_loglikelihood
lm_eval/utils.py  (modified)

@@ -20,7 +20,16 @@ import numpy as np
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
 
-from lm_eval.logger import eval_logger
+import logging
+
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
+
+eval_logger = logging.getLogger("lm-eval")
 
 SPACING = " " * 47
 
 
 def escaped_split(text, sep_char, maxsplit=-1):
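With this hunk, eval_logger is defined directly in lm_eval.utils (a standard logging logger named "lm-eval" with the format configured above) instead of being imported from lm_eval.logger; downstream callers such as scripts/write_out.py below adjust their imports accordingly. A minimal usage sketch, with illustrative messages:

# Minimal sketch: downstream modules now take the logger from lm_eval.utils.
from lm_eval.utils import eval_logger

eval_logger.info("including external task path ...")  # illustrative message
eval_logger.warning("task not found in registry")     # illustrative message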
scripts/write_out.py  (modified)

@@ -4,9 +4,8 @@ import json
 import os
 import random
 
 from lm_eval import tasks
-from lm_eval.utils import join_iters
-from lm_eval.tasks import include_path
-from lm_eval.logger import eval_logger
+from lm_eval.utils import join_iters, eval_logger
+from lm_eval.tasks import initialize_tasks, include_path
 
 EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"

@@ -25,6 +24,12 @@ def parse_args():
         default=None,
         help="Additional path to include if there are external tasks to include.",
     )
+    parser.add_argument(
+        "--verbosity",
+        type=str,
+        default="INFO",
+        help="Log error when tasks are not registered.",
+    )
     return parser.parse_args()

@@ -32,6 +37,8 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)
 
+    initialize_tasks(args.verbosity)
+
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
         include_path(args.include_path)
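The three hunks above add a --verbosity flag and make the script build the task registry before resolving any task names. A minimal sketch of that bootstrap order, using only names visible in this diff; the external directory path is hypothetical:

# Minimal sketch of the new bootstrap order used by scripts/write_out.py:
# initialize the task registry first, then include any external task directory.
from lm_eval.tasks import initialize_tasks, include_path
from lm_eval.utils import eval_logger

initialize_tasks("INFO")  # verbosity string, as passed via --verbosity
external_tasks = "/path/to/external/tasks"  # hypothetical directory
eval_logger.info(f"Including path: {external_tasks}")
include_path(external_tasks)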
tests/models/test_huggingface.py  (modified)

@@ -8,6 +8,8 @@ import lm_eval.tasks as tasks
 import sys
 import torch
 
+tasks.initialize_tasks()
+
 
 class Test_HFLM:
     torch.use_deterministic_algorithms(True)

@@ -15,7 +17,7 @@ class Test_HFLM:
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")()  # type: ignore
+    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances

@@ -115,7 +117,7 @@ class Test_HFLM:
     def test_logliklihood_rolling(self) -> None:
         res = self.LM.loglikelihood_rolling(self.ROLLING)
-        assert np.allclose(res, self.ROLLING_RES, atol=1e-2)
+        assert np.allclose(res, self.ROLLING_RES, atol=1e-1)
 
     def test_toc_encode(self) -> None:
         res = self.LM.tok_encode("foo bar")
tests/test_evaluator.py  (modified)

@@ -11,6 +11,7 @@ from typing import List
 import random
 import pytest
 
+tasks.initialize_tasks()
 
 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces
tests/test_tasks.py  (modified)

@@ -4,7 +4,7 @@ from .utils import new_tasks
 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask
 
+tasks.initialize_tasks()
 
 # Default Task
 TASKS = ["arc_easy"]
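All three test files now call tasks.initialize_tasks() before touching the task registry, mirroring the write_out.py change above. A hedged end-to-end sketch under the same assumption; simple_evaluate and its arguments are assumed from the big-refactor evaluator API and are not part of this commit:

# Hedged sketch (not part of this commit): evaluate the newly registered
# "squadv2" task, assuming lm_eval.evaluator.simple_evaluate from the
# big-refactor branch and that the task registry must be initialized first.
import lm_eval.tasks as tasks
from lm_eval.evaluator import simple_evaluate

tasks.initialize_tasks()
results = simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # illustrative small model
    tasks=["squadv2"],
    limit=8,                       # small limit for a quick smoke test
)
print(results["results"]["squadv2"])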