Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
cda25fef
Unverified
Commit
cda25fef
authored
Jan 02, 2024
by
Lintang Sutawika
Committed by
GitHub
Jan 02, 2024
Browse files
Merge branch 'main' into standardize_metrics
parents
dfb41835
4d10ad56
Changes
249
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
41 additions
and
25 deletions
+41
-25
lm_eval/tasks/pubmedqa/pubmedqa.yaml
lm_eval/tasks/pubmedqa/pubmedqa.yaml
+1
-1
lm_eval/tasks/qa4mre/qa4mre_2011.yaml
lm_eval/tasks/qa4mre/qa4mre_2011.yaml
+1
-1
lm_eval/tasks/qasper/bool.yaml
lm_eval/tasks/qasper/bool.yaml
+1
-1
lm_eval/tasks/qasper/freeform.yaml
lm_eval/tasks/qasper/freeform.yaml
+1
-1
lm_eval/tasks/qasper/utils.py
lm_eval/tasks/qasper/utils.py
+0
-1
lm_eval/tasks/race/README.md
lm_eval/tasks/race/README.md
+19
-1
lm_eval/tasks/race/race.yaml
lm_eval/tasks/race/race.yaml
+1
-1
lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml
lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml
+1
-1
lm_eval/tasks/sciq/sciq.yaml
lm_eval/tasks/sciq/sciq.yaml
+1
-1
lm_eval/tasks/scrolls/task.py
lm_eval/tasks/scrolls/task.py
+0
-1
lm_eval/tasks/siqa/siqa.yaml
lm_eval/tasks/siqa/siqa.yaml
+6
-3
lm_eval/tasks/squadv2/task.py
lm_eval/tasks/squadv2/task.py
+2
-3
lm_eval/tasks/storycloze/storycloze_2016.yaml
lm_eval/tasks/storycloze/storycloze_2016.yaml
+1
-1
lm_eval/tasks/super_glue/boolq/default.yaml
lm_eval/tasks/super_glue/boolq/default.yaml
+1
-1
lm_eval/tasks/super_glue/boolq/seq2seq.yaml
lm_eval/tasks/super_glue/boolq/seq2seq.yaml
+1
-1
lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
+1
-1
lm_eval/tasks/super_glue/cb/default.yaml
lm_eval/tasks/super_glue/cb/default.yaml
+1
-1
lm_eval/tasks/super_glue/cb/t5-prompt.yaml
lm_eval/tasks/super_glue/cb/t5-prompt.yaml
+1
-1
lm_eval/tasks/super_glue/cb/t5_utils.py
lm_eval/tasks/super_glue/cb/t5_utils.py
+0
-2
lm_eval/tasks/super_glue/copa/default.yaml
lm_eval/tasks/super_glue/copa/default.yaml
+1
-1
No files found.
lm_eval/tasks/pubmedqa/pubmedqa.yaml
View file @
cda25fef
...
...
@@ -13,4 +13,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/qa4mre/qa4mre_2011.yaml
View file @
cda25fef
...
...
@@ -19,4 +19,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/qasper/bool.yaml
View file @
cda25fef
...
...
@@ -11,4 +11,4 @@ doc_to_choice: ["no", "yes"]
metric_list
:
-
metric
:
f1
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/qasper/freeform.yaml
View file @
cda25fef
...
...
@@ -15,4 +15,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/qasper/utils.py
View file @
cda25fef
...
...
@@ -3,7 +3,6 @@ from functools import partial
def
process_docs
(
dataset
,
set_answer_type
=
"bool"
):
FEATURES
=
[
"title"
,
"abstract"
,
"question"
,
"answer"
,
"answer_type"
]
def
_categorise_answer
(
answer_blob
):
...
...
lm_eval/tasks/race/README.md
View file @
cda25fef
...
...
@@ -17,7 +17,25 @@ Homepage: https://www.cs.cmu.edu/~glai1/data/race/
### Citation
```
BibTeX-formatted citation goes here
@inproceedings{lai-etal-2017-race,
title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
author = "Lai, Guokun and
Xie, Qizhe and
Liu, Hanxiao and
Yang, Yiming and
Hovy, Eduard",
editor = "Palmer, Martha and
Hwa, Rebecca and
Riedel, Sebastian",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1082",
doi = "10.18653/v1/D17-1082",
pages = "785--794"
}
```
### Groups and Tasks
...
...
lm_eval/tasks/race/race.yaml
View file @
cda25fef
...
...
@@ -11,4 +11,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
2.0
version
:
2.0
lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml
View file @
cda25fef
...
...
@@ -14,4 +14,4 @@ generation_kwargs:
do_sample
:
false
temperature
:
0.0
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/sciq/sciq.yaml
View file @
cda25fef
...
...
@@ -18,4 +18,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/scrolls/task.py
View file @
cda25fef
...
...
@@ -235,7 +235,6 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
}
def
construct_requests
(
self
,
doc
,
ctx
,
**
kwargs
):
request_list
=
[
Instance
(
request_type
=
"loglikelihood"
,
...
...
lm_eval/tasks/siqa/
default
.yml
→
lm_eval/tasks/siqa/
siqa
.y
a
ml
View file @
cda25fef
...
...
@@ -6,11 +6,14 @@ training_split: train
validation_split
:
validation
doc_to_text
:
"
Q:
{{context}}
{{question}}
\n
A:"
target_delimiter
:
"
"
doc_to_choice
:
[
"
{{answerA}}"
,
"
{{answerB}}"
,
"
{{answerC}}"
]
doc_to_target
:
"
{{label}}"
doc_to_choice
:
-
"
{{answerA}}"
-
"
{{answerB}}"
-
"
{{answerC}}"
doc_to_target
:
"
{{
(label|int)
-
1
}}"
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/squadv2/task.py
View file @
cda25fef
...
...
@@ -14,7 +14,6 @@ also determine when no answer is supported by the paragraph and abstain from ans
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import
datasets
from
evaluate
import
load
from
math
import
exp
from
functools
import
partial
...
...
@@ -120,14 +119,14 @@ class SQuAD2(Task):
doc
=
doc
,
arguments
=
(
ctx
,
{
"until"
:
[
"
\n
"
]}),
idx
=
0
,
**
kwargs
**
kwargs
,
),
Instance
(
request_type
=
"loglikelihood"
,
doc
=
doc
,
arguments
=
(
ctx
,
" "
+
"unanswerable"
),
idx
=
0
,
**
kwargs
**
kwargs
,
),
]
...
...
lm_eval/tasks/storycloze/storycloze_2016.yaml
View file @
cda25fef
...
...
@@ -15,4 +15,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/super_glue/boolq/default.yaml
View file @
cda25fef
...
...
@@ -14,4 +14,4 @@ doc_to_decontamination_query: passage
metric_list
:
-
metric
:
acc
metadata
:
-
version
:
2.0
version
:
2.0
lm_eval/tasks/super_glue/boolq/seq2seq.yaml
View file @
cda25fef
...
...
@@ -23,4 +23,4 @@ metric_list:
ignore_case
:
true
ignore_punctuation
:
true
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
View file @
cda25fef
...
...
@@ -19,4 +19,4 @@ metric_list:
ignore_case
:
true
ignore_punctuation
:
true
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/cb/default.yaml
View file @
cda25fef
...
...
@@ -14,4 +14,4 @@ metric_list:
-
metric
:
f1
aggregation
:
!function
"
aggregate.cb_multi_fi"
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/super_glue/cb/t5-prompt.yaml
View file @
cda25fef
...
...
@@ -22,4 +22,4 @@ metric_list:
aggregation
:
!function
"
t5_utils.agg_mean_3class_f1"
higher_is_better
:
true
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/cb/t5_utils.py
View file @
cda25fef
...
...
@@ -2,7 +2,6 @@ import sklearn.metrics
def
mean_3class_f1
(
predictions
,
references
):
# This is a passthrough function
string_label
=
[
"entailment"
,
"contradiction"
,
"neutral"
]
predictions
=
(
string_label
.
index
(
predictions
[
0
])
if
predictions
[
0
]
in
string_label
else
0
...
...
@@ -13,7 +12,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function
def
agg_mean_3class_f1
(
items
):
predictions
,
references
=
zip
(
*
items
)
"""Computes the unweighted average of the F1 per class."""
...
...
lm_eval/tasks/super_glue/copa/default.yaml
View file @
cda25fef
...
...
@@ -12,4 +12,4 @@ doc_to_choice: !function utils.doc_to_choice
metric_list
:
-
metric
:
acc
metadata
:
-
version
:
1.0
version
:
1.0
Prev
1
…
5
6
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment