gaoqiong / lm-evaluation-harness

Commit 79aa53b1, authored Aug 30, 2023 by lintangsutawika

running process for drop

Parent: f9558ce5

Showing 4 changed files with 182 additions and 13 deletions (+182, -13)
lm_eval/tasks/README.md          +1   -1
lm_eval/tasks/drop/README.md     +10  -4
lm_eval/tasks/drop/default.yaml  +16  -3
lm_eval/tasks/drop/utils.py      +155 -5
lm_eval/tasks/README.md

@@ -6,7 +6,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] Glue
 - [x] SuperGlue
 - [ ] CoQA (Lintang)
-- [ ] DROP (Lintang)
+- [x] DROP
 - [x] ~~Lambada~~
 - [x] Lambada (Cloze variants)
 - [x] ~~Lambada (Multilingual)~~
...
lm_eval/tasks/drop/README.md

@@ -19,19 +19,25 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r
 ### Citation
 
 ```
-BibTeX-formatted citation goes here
+@misc{dua2019drop,
+    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
+    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
+    year={2019},
+    eprint={1903.00161},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
 ```
 
 ### Groups and Tasks
 
 #### Groups
 
-* `group_name`: `Short description`
+* Not part of a group yet.
 
 #### Tasks
 
-* `task_name`: `1-sentence description of what this particular task does`
-* `task_name2`: ...
+* `drop`
 
 ### Checklist
...
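For orientation (not part of the diff): once registered, the `drop` task can be invoked through the harness's Python entry point. The sketch below is purely illustrative; the `hf` model name, the model arguments, and the result keys are assumptions about the refactor branch rather than anything introduced by this commit.

```python
# Illustrative smoke test -- not part of this commit. Assumes the refactor branch
# exposes simple_evaluate() and registers the HuggingFace adapter under "hf".
import lm_eval.evaluator as evaluator

out = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["drop"],            # the task defined by lm_eval/tasks/drop/default.yaml
    num_fewshot=0,
    limit=8,                   # score only a handful of documents
)
print(out["results"]["drop"])  # should report the "em" and "f1" metrics added in this commit
```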
lm_eval/tasks/drop/default.yaml

@@ -2,8 +2,21 @@ task: drop
 dataset_path: EleutherAI/drop
 output_type: greedy_until
 training_split: train
-validation_split: test
-doc_to_text: "Passage: {{passage}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
+validation_split: validation
+process_docs: !function utils.process_docs
+doc_to_text: "{{passage}} {{question}}"
+doc_to_target: "{{answer|join(',')}}"
+target_delimiter: " "
+process_results: !function utils.process_results
 should_decontaminate: true
 doc_to_decontamination_query: "{{passage}} {{question}}"
+generation_kwargs:
+  until:
+    - "."
+metric_list:
+  - metric: em
+    aggregation: mean
+    higher_is_better: true
+  - metric: f1
+    aggregation: mean
+    higher_is_better: true
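To make the new prompt and target wiring concrete, here is a small sketch of how the two Jinja templates above render against a single processed document. The sample document is invented, and the `answer` field is assumed to be a list of gold answer strings (the full output of `process_docs` is collapsed in this diff).

```python
# Illustrative only -- mimics how the harness renders doc_to_text / doc_to_target
# with Jinja2 for one (invented) DROP document.
from jinja2 import Template

doc = {
    "passage": "The Bears scored 21 points in the first half and 7 in the second.",
    "question": "How many points did the Bears score in total?",
    "answer": ["28"],  # assumed shape: list of gold answer strings
}

prompt = Template("{{passage}} {{question}}").render(**doc)
target = Template("{{answer|join(',')}}").render(**doc)

print(prompt)  # "The Bears scored 21 points in the first half and 7 in the second. How many points ..."
print(target)  # "28"
```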
lm_eval/tasks/drop/utils.py

 import re
 import string
 
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
 
 
-def process_doc(dataset):
+def process_docs(dataset):
     def _process(doc):
         return {
             "id": doc["query_id"],
...
@@ -8,6 +15,7 @@ def process_doc(dataset):
             "question": doc["question"],
             "answers": get_answers(doc),
         }
 
     return dataset.map(_process)
...
@@ -30,9 +38,7 @@ def get_answers(doc):
     answers = []
     answers_set = set()
 
-    candidates = [doc["answer"]] + _flatten_validated_answers(
-        doc["validated_answers"]
-    )
+    candidates = [doc["answer"]] + _flatten_validated_answers(doc["validated_answers"])
     for candidate in candidates:
         answer = parse_answer(candidate)
         if answer in answers_set:
...
@@ -41,6 +47,7 @@ def get_answers(doc):
         answers.append(answer)
     return answers
 
 
 def parse_answer(answer):
     # NOTE: Everything is returned as a tuple for uniformity and hashability.
     if answer["number"] != "":
...
@@ -51,4 +58,147 @@ def parse_answer(answer):
         " ".join(
             [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
         ).strip(),
-    )
\ No newline at end of file
+    )
+
+
+def process_results(doc, results):
+    preds, golds = results, doc["answers"]
+    max_em = 0
+    max_f1 = 0
+    for gold_answer in golds:
+        exact_match, f1_score = get_metrics(preds, gold_answer)
+        if gold_answer[0].strip():
+            max_em = max(max_em, exact_match)
+            max_f1 = max(max_f1, f1_score)
+    return {"em": max_em, "f1": max_f1}
+
+
+def get_metrics(predicted, gold):
+    """
+    Takes a predicted answer and a gold answer (that are both either a string or a list of
+    strings), and returns exact match and the DROP F1 metric for the prediction. If you are
+    writing a script for evaluating objects in memory (say, the output of predictions during
+    validation, or while training), this is the function you want to call, after using
+    :func:`answer_json_to_strings` when reading the gold answer from the released data file.
+    """
+    predicted_bags = _answer_to_bags(predicted)
+    gold_bags = _answer_to_bags(gold)
+
+    if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(
+        gold_bags[0]
+    ):
+        exact_match = 1.0
+    else:
+        exact_match = 0.0
+
+    f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
+    f1 = np.mean(f1_per_bag)
+    f1 = round(f1, 2)
+    return exact_match, f1
+
+
+def _answer_to_bags(answer):
+    if isinstance(answer, (list, tuple)):
+        raw_spans = answer
+    else:
+        raw_spans = [answer]
+    normalized_spans = []
+    token_bags = []
+    for raw_span in raw_spans:
+        normalized_span = _normalize(raw_span)
+        normalized_spans.append(normalized_span)
+        token_bags.append(set(normalized_span.split()))
+    return normalized_spans, token_bags
+
+
+def _align_bags(predicted, gold):
+    """
+    Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
+    between them and gets maximum metric values over all the answers.
+    """
+    scores = np.zeros([len(gold), len(predicted)])
+    for gold_index, gold_item in enumerate(gold):
+        for pred_index, pred_item in enumerate(predicted):
+            if _match_numbers_if_present(gold_item, pred_item):
+                scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
+    row_ind, col_ind = linear_sum_assignment(-scores)
+
+    max_scores = np.zeros([max(len(gold), len(predicted))])
+    for row, column in zip(row_ind, col_ind):
+        max_scores[row] = max(max_scores[row], scores[row, column])
+    return max_scores
+
+
+def _compute_f1(predicted_bag, gold_bag):
+    intersection = len(gold_bag.intersection(predicted_bag))
+    if not predicted_bag:
+        precision = 1.0
+    else:
+        precision = intersection / float(len(predicted_bag))
+    if not gold_bag:
+        recall = 1.0
+    else:
+        recall = intersection / float(len(gold_bag))
+    f1 = (
+        (2 * precision * recall) / (precision + recall)
+        if not (precision == 0.0 and recall == 0.0)
+        else 0.0
+    )
+    return f1
+
+
+def _match_numbers_if_present(gold_bag, predicted_bag):
+    gold_numbers = set()
+    predicted_numbers = set()
+    for word in gold_bag:
+        if _is_number(word):
+            gold_numbers.add(word)
+    for word in predicted_bag:
+        if _is_number(word):
+            predicted_numbers.add(word)
+    if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
+        return True
+    return False
+
+
+def _is_number(text):
+    try:
+        float(text)
+        return True
+    except ValueError:
+        return False
+
+
+def _remove_articles(text):
+    return _ARTICLES.sub(" ", text)
+
+
+def _white_space_fix(text):
+    return " ".join(text.split())
+
+
+def _remove_punc(text):
+    exclude = set(string.punctuation)
+    if not _is_number(text):
+        return "".join(ch for ch in text if ch not in exclude)
+    else:
+        return text
+
+
+def _fix_number(text):
+    return str(float(text)) if _is_number(text) else text
+
+
+def _tokenize(text):
+    return re.split(" |-", text)
+
+
+def _normalize(answer):
+    tokens = [
+        _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower()))))
+        for token in _tokenize(answer)
+    ]
+    tokens = [token for token in tokens if token.strip()]
+    normalized = " ".join(tokens).strip()
+    return normalized
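As a quick sanity check of the scoring path added above (not part of the commit): the sketch below runs the new functions on an invented prediction/gold pair. The import path simply mirrors where the file lives in the repository and may need adjusting; the expected numbers follow from the normalization and F1 logic above.

```python
# Illustrative only -- exercising the new DROP metrics on invented data.
from lm_eval.tasks.drop.utils import get_metrics, process_results

doc = {"answers": [("28",), ("28 points",)]}  # gold tuples, as built by get_answers()
results = ["28 points"]                       # greedy generation returned by the model

print(get_metrics(results, ("28",)))   # (0.0, 0.67): token overlap, but no exact match
print(process_results(doc, results))   # {'em': 1.0, 'f1': 1.0}: best score over all golds
```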