gaoqiong / lm-evaluation-harness · Commits · 0d1ef037

Commit 0d1ef037, authored Jan 17, 2024 by lintangsutawika
Commit message: solved merge conflict
Parents: aa44be3f, ada4a31d

Changes: 424; this view shows 20 changed files with 43 additions and 29 deletions (+43 -29).
Files in this view:

lm_eval/tasks/race/README.md (+19 -1)
lm_eval/tasks/race/race.yaml (+1 -1)
lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml (+1 -1)
lm_eval/tasks/sciq/sciq.yaml (+1 -1)
lm_eval/tasks/scrolls/task.py (+1 -2)
lm_eval/tasks/siqa/siqa.yaml (+6 -3)
lm_eval/tasks/squadv2/task.py (+3 -4)
lm_eval/tasks/storycloze/storycloze_2016.yaml (+1 -1)
lm_eval/tasks/super_glue/boolq/default.yaml (+1 -1)
lm_eval/tasks/super_glue/boolq/seq2seq.yaml (+1 -1)
lm_eval/tasks/super_glue/boolq/t5-prompt.yaml (+1 -1)
lm_eval/tasks/super_glue/cb/default.yaml (+1 -1)
lm_eval/tasks/super_glue/cb/t5-prompt.yaml (+1 -1)
lm_eval/tasks/super_glue/cb/t5_utils.py (+0 -2)
lm_eval/tasks/super_glue/copa/default.yaml (+1 -1)
lm_eval/tasks/super_glue/copa/t5-prompt.yaml (+1 -1)
lm_eval/tasks/super_glue/multirc/default.yaml (+1 -1)
lm_eval/tasks/super_glue/multirc/t5-prompt.yaml (+1 -1)
lm_eval/tasks/super_glue/multirc/t5_utils.py (+0 -3)
lm_eval/tasks/super_glue/record/default.yaml (+1 -1)
lm_eval/tasks/race/README.md

@@ -17,7 +17,25 @@ Homepage: https://www.cs.cmu.edu/~glai1/data/race/
 ### Citation

 ```
-BibTeX-formatted citation goes here
+@inproceedings{lai-etal-2017-race,
+    title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
+    author = "Lai, Guokun and
+      Xie, Qizhe and
+      Liu, Hanxiao and
+      Yang, Yiming and
+      Hovy, Eduard",
+    editor = "Palmer, Martha and
+      Hwa, Rebecca and
+      Riedel, Sebastian",
+    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
+    month = sep,
+    year = "2017",
+    address = "Copenhagen, Denmark",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/D17-1082",
+    doi = "10.18653/v1/D17-1082",
+    pages = "785--794"
+}
 ```

 ### Groups and Tasks
lm_eval/tasks/race/race.yaml

@@ -11,4 +11,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  - version: 2.0
+  version: 2.0
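The same one-line change recurs in most of the YAML files in this commit: the `metadata` block goes from a single-item list to a plain mapping. As a minimal standalone sketch (using PyYAML, which is not part of the diff), the two spellings parse into different structures:

    import yaml  # PyYAML, used here only to illustrate how the two forms parse

    old_form = "metadata:\n  - version: 2.0\n"
    new_form = "metadata:\n  version: 2.0\n"

    print(yaml.safe_load(old_form))  # {'metadata': [{'version': 2.0}]}  -> list of dicts
    print(yaml.safe_load(new_form))  # {'metadata': {'version': 2.0}}    -> plain mapping

With the mapping form, a consumer of the config can read the version directly as `config["metadata"]["version"]` rather than indexing into a one-element list.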
lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml

@@ -14,4 +14,4 @@ generation_kwargs:
   do_sample: false
   temperature: 0.0
 metadata:
-  - version: 0.0
+  version: 0.0
lm_eval/tasks/sciq/sciq.yaml

@@ -18,4 +18,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  - version: 1.0
+  version: 1.0
lm_eval/tasks/scrolls/task.py

@@ -108,7 +108,7 @@ def _num_cpu_cores():
 class _SCROLLSTask(Task):
-    VERSION = 1
+    VERSION = 2
     DATASET_PATH = "tau/scrolls"
     DATASET_NAME = None
     PRUNE_TOKENIZERS = None

@@ -235,7 +235,6 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
         }

     def construct_requests(self, doc, ctx, **kwargs):
         request_list = [
             Instance(
                 request_type="loglikelihood",
lm_eval/tasks/siqa/default.yml → lm_eval/tasks/siqa/siqa.yaml

@@ -6,11 +6,14 @@ training_split: train
 validation_split: validation
 doc_to_text: "Q: {{context}} {{question}}\nA:"
 target_delimiter: " "
-doc_to_choice: ["{{answerA}}", "{{answerB}}", "{{answerC}}"]
-doc_to_target: "{{label}}"
+doc_to_choice:
+  - "{{answerA}}"
+  - "{{answerB}}"
+  - "{{answerC}}"
+doc_to_target: "{{ (label|int) - 1 }}"
 metric_list:
   - metric: acc
     aggregation: mean
     higher_is_better: true
 metadata:
-  - version: 0.0
+  version: 0.0
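The new `doc_to_target` template converts the dataset's label into a 0-based index into the three `doc_to_choice` entries. A small standalone sketch of how the template renders, using plain Jinja2 outside the harness and assuming the Social IQA labels are the strings "1", "2", "3":

    from jinja2 import Template

    # Standalone rendering for illustration; the harness applies this template itself.
    doc_to_target = Template("{{ (label|int) - 1 }}")

    for label in ["1", "2", "3"]:
        print(label, "->", doc_to_target.render(label=label))
    # 1 -> 0
    # 2 -> 1
    # 3 -> 2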
lm_eval/tasks/squadv2/task.py

@@ -14,7 +14,6 @@ also determine when no answer is supported by the paragraph and abstain from answering.
 Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 """
 import datasets
 from evaluate import load
 from math import exp
 from functools import partial

@@ -50,7 +49,7 @@ def _squad_agg(key, items):
 @register_task("squadv2")
 class SQuAD2(Task):
-    VERSION = 2
+    VERSION = 3
     DATASET_PATH = "squad_v2"
     DATASET_NAME = None

@@ -120,14 +119,14 @@ class SQuAD2(Task):
                 doc=doc,
                 arguments=(ctx, {"until": ["\n"]}),
                 idx=0,
-                **kwargs
+                **kwargs,
             ),
             Instance(
                 request_type="loglikelihood",
                 doc=doc,
                 arguments=(ctx, " " + "unanswerable"),
                 idx=0,
-                **kwargs
+                **kwargs,
             ),
         ]
lm_eval/tasks/storycloze/storycloze_2016.yaml

@@ -15,4 +15,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  - version: 1.0
+  version: 1.0
lm_eval/tasks/super_glue/boolq/default.yaml

@@ -14,4 +14,4 @@ doc_to_decontamination_query: passage
 metric_list:
   - metric: acc
 metadata:
-  - version: 2.0
+  version: 2.0
lm_eval/tasks/super_glue/boolq/seq2seq.yaml

@@ -23,4 +23,4 @@ metric_list:
     ignore_case: true
     ignore_punctuation: true
 metadata:
-  - version: 0.0
+  version: 0.0
lm_eval/tasks/super_glue/boolq/t5-prompt.yaml

@@ -19,4 +19,4 @@ metric_list:
     ignore_case: true
     ignore_punctuation: true
 metadata:
-  - version: 0.0
+  version: 0.0
lm_eval/tasks/super_glue/cb/default.yaml

@@ -14,4 +14,4 @@ metric_list:
   - metric: f1
     aggregation: !function "aggregate.cb_multi_fi"
 metadata:
-  - version: 1.0
+  version: 1.0
lm_eval/tasks/super_glue/cb/t5-prompt.yaml

@@ -22,4 +22,4 @@ metric_list:
     aggregation: !function "t5_utils.agg_mean_3class_f1"
     higher_is_better: true
 metadata:
-  - version: 0.0
+  version: 0.0
lm_eval/tasks/super_glue/cb/t5_utils.py

@@ -2,7 +2,6 @@ import sklearn.metrics
 def mean_3class_f1(predictions, references):  # This is a passthrough function
     string_label = ["entailment", "contradiction", "neutral"]
     predictions = (
         string_label.index(predictions[0]) if predictions[0] in string_label else 0

@@ -13,7 +12,6 @@ def mean_3class_f1(predictions, references):  # This is a passthrough function
 def agg_mean_3class_f1(items):
     predictions, references = zip(*items)

     """Computes the unweighted average of the F1 per class."""
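For context on the aggregation these hunks touch: `mean_3class_f1` maps each string prediction to a class index, and `agg_mean_3class_f1` is documented as the unweighted average of the per-class F1. A hedged standalone sketch of that aggregation with made-up (prediction, reference) pairs; the wiring through the harness's metric machinery is not shown:

    import sklearn.metrics

    # Hypothetical per-example (prediction, reference) class indices for the three
    # CB labels: 0 = entailment, 1 = contradiction, 2 = neutral.
    items = [(0, 0), (1, 1), (2, 2), (1, 2)]
    predictions, references = zip(*items)

    # "Unweighted average of the F1 per class" corresponds to macro-averaged F1.
    print(sklearn.metrics.f1_score(references, predictions, average="macro"))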
lm_eval/tasks/super_glue/copa/default.yaml

@@ -12,4 +12,4 @@ doc_to_choice: !function utils.doc_to_choice
 metric_list:
   - metric: acc
 metadata:
-  - version: 1.0
+  version: 1.0
lm_eval/tasks/super_glue/copa/t5-prompt.yaml

@@ -19,4 +19,4 @@ metric_list:
     ignore_case: true
     ignore_punctuation: true
 metadata:
-  - version: 0.0
+  version: 0.0
lm_eval/tasks/super_glue/multirc/default.yaml

@@ -12,4 +12,4 @@ doc_to_choice: "['''{{answer}}\\nIs the answer correct? yes''', '''{{answer}}\\n
 metric_list:
   - metric: acc
 metadata:
-  - version: 2.0
+  version: 2.0
lm_eval/tasks/super_glue/multirc/t5-prompt.yaml

@@ -20,4 +20,4 @@ metric_list:
     aggregation: !function t5_utils.agg_em
     higher_is_better: true
 metadata:
-  - version: 0.0
+  version: 0.0
lm_eval/tasks/super_glue/multirc/t5_utils.py

@@ -5,7 +5,6 @@ import sklearn.metrics
 def f1(predictions, references):  # This is a passthrough function
     _prediction = predictions[0]
     _reference = references[0].split("_")[-1]
     string_label = ["False", "True"]

@@ -20,7 +19,6 @@ def f1(predictions, references):  # This is a passthrough function
 def agg_f1(items):
     predictions, references = zip(*items)
     references, predictions = np.asarray(references), np.asarray(predictions)

@@ -28,7 +26,6 @@ def agg_f1(items):
 def em(predictions, references):  # This is a passthrough function
     _prediction = predictions[0]
     _group, _reference = references[0].split("_")
     string_label = ["False", "True"]
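The MultiRC passthrough helpers above assume each reference arrives as a single "<group>_<label>" string: `em` splits it into what looks like a question-group id plus a True/False label, while `f1` keeps only the label. A tiny standalone sketch with a made-up group id:

    # Hypothetical reference string; the group id "42" is illustrative only.
    reference = "42_True"
    string_label = ["False", "True"]

    _group, _label = reference.split("_")       # what em() does
    _label_only = reference.split("_")[-1]      # what f1() does

    print(_group, string_label.index(_label))   # 42 1
    print(string_label.index(_label_only))      # 1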
lm_eval/tasks/super_glue/record/default.yaml

@@ -17,4 +17,4 @@ metric_list:
     higher_is_better: True
     aggregation: mean
 metadata:
-  - version: 1.0
+  version: 1.0