Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
0d1ef037
"test/vscode:/vscode.git/clone" did not exist on "fd6323c605c0571a161edad2081e7024d91d61a7"
Commit
0d1ef037
authored
Jan 17, 2024
by
lintangsutawika
Browse files
solved merge conflict
parents
aa44be3f
ada4a31d
Changes
424
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
17 additions
and
29 deletions
+17
-29
lm_eval/tasks/super_glue/record/t5-prompt.yaml
lm_eval/tasks/super_glue/record/t5-prompt.yaml
+1
-1
lm_eval/tasks/super_glue/record/t5_utils.py
lm_eval/tasks/super_glue/record/t5_utils.py
+1
-4
lm_eval/tasks/super_glue/rte/default.yaml
lm_eval/tasks/super_glue/rte/default.yaml
+1
-1
lm_eval/tasks/super_glue/rte/t5-prompt.yaml
lm_eval/tasks/super_glue/rte/t5-prompt.yaml
+1
-1
lm_eval/tasks/super_glue/wic/default.yaml
lm_eval/tasks/super_glue/wic/default.yaml
+1
-1
lm_eval/tasks/super_glue/wic/t5-prompt.yaml
lm_eval/tasks/super_glue/wic/t5-prompt.yaml
+1
-1
lm_eval/tasks/super_glue/wsc/default.yaml
lm_eval/tasks/super_glue/wsc/default.yaml
+1
-1
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
+1
-1
lm_eval/tasks/super_glue/wsc/t5_utils.py
lm_eval/tasks/super_glue/wsc/t5_utils.py
+0
-4
lm_eval/tasks/swag/swag.yaml
lm_eval/tasks/swag/swag.yaml
+1
-1
lm_eval/tasks/toxigen/toxigen.yaml
lm_eval/tasks/toxigen/toxigen.yaml
+1
-1
lm_eval/tasks/translation/utils.py
lm_eval/tasks/translation/utils.py
+0
-2
lm_eval/tasks/translation/wmt_common_yaml
lm_eval/tasks/translation/wmt_common_yaml
+1
-1
lm_eval/tasks/triviaqa/default.yaml
lm_eval/tasks/triviaqa/default.yaml
+1
-1
lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
+1
-1
lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
+1
-1
lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml
lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml
+1
-1
lm_eval/tasks/truthfulqa/utils.py
lm_eval/tasks/truthfulqa/utils.py
+0
-3
lm_eval/tasks/unscramble/anagrams1.yaml
lm_eval/tasks/unscramble/anagrams1.yaml
+1
-1
lm_eval/tasks/unscramble/anagrams2.yaml
lm_eval/tasks/unscramble/anagrams2.yaml
+1
-1
No files found.
lm_eval/tasks/super_glue/record/t5-prompt.yaml
View file @
0d1ef037
...
@@ -19,4 +19,4 @@ metric_list:
...
@@ -19,4 +19,4 @@ metric_list:
aggregation
:
!function
t5_utils.squad_f1_agg
aggregation
:
!function
t5_utils.squad_f1_agg
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/record/t5_utils.py
View file @
0d1ef037
...
@@ -3,14 +3,12 @@ import string
...
@@ -3,14 +3,12 @@ import string
import
collections
import
collections
import
numpy
as
np
import
numpy
as
np
from
tqdm
import
tqdm
from
datasets
import
Dataset
from
datasets
import
Dataset
,
concatenate_datasets
from
lm_eval.api.metrics
import
metric_max_over_ground_truths
from
lm_eval.api.metrics
import
metric_max_over_ground_truths
def
doc_to_text
(
doc
):
def
doc_to_text
(
doc
):
passage
=
doc
[
"passage"
]
passage
=
doc
[
"passage"
]
passage
=
re
.
sub
(
r
"(\.|\?|\!|\"|\')\n@highlight\n"
,
r
"\1 "
,
passage
)
passage
=
re
.
sub
(
r
"(\.|\?|\!|\"|\')\n@highlight\n"
,
r
"\1 "
,
passage
)
passage
=
re
.
sub
(
r
"\n@highlight\n"
,
". "
,
passage
)
passage
=
re
.
sub
(
r
"\n@highlight\n"
,
". "
,
passage
)
...
@@ -34,7 +32,6 @@ def process_docs(dataset):
...
@@ -34,7 +32,6 @@ def process_docs(dataset):
}
}
answers
=
doc
.
pop
(
"answers"
)
answers
=
doc
.
pop
(
"answers"
)
for
idx
,
answer
in
enumerate
(
answers
):
for
idx
,
answer
in
enumerate
(
answers
):
for
key
in
split_doc
.
keys
():
for
key
in
split_doc
.
keys
():
if
key
in
doc
:
if
key
in
doc
:
split_doc
[
key
].
append
(
doc
[
key
])
split_doc
[
key
].
append
(
doc
[
key
])
...
...
lm_eval/tasks/super_glue/rte/default.yaml
View file @
0d1ef037
...
@@ -12,4 +12,4 @@ doc_to_choice: ['True', 'False']
...
@@ -12,4 +12,4 @@ doc_to_choice: ['True', 'False']
metric_list
:
metric_list
:
-
metric
:
acc
-
metric
:
acc
metadata
:
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/rte/t5-prompt.yaml
View file @
0d1ef037
...
@@ -19,4 +19,4 @@ metric_list:
...
@@ -19,4 +19,4 @@ metric_list:
ignore_case
:
true
ignore_case
:
true
ignore_punctuation
:
true
ignore_punctuation
:
true
metadata
:
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/wic/default.yaml
View file @
0d1ef037
...
@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
...
@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
metric_list
:
metric_list
:
-
metric
:
acc
-
metric
:
acc
metadata
:
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/super_glue/wic/t5-prompt.yaml
View file @
0d1ef037
...
@@ -19,4 +19,4 @@ metric_list:
...
@@ -19,4 +19,4 @@ metric_list:
ignore_case
:
true
ignore_case
:
true
ignore_punctuation
:
true
ignore_punctuation
:
true
metadata
:
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/wsc/default.yaml
View file @
0d1ef037
...
@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
...
@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
metric_list
:
metric_list
:
-
metric
:
acc
-
metric
:
acc
metadata
:
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
View file @
0d1ef037
...
@@ -20,4 +20,4 @@ filter_list:
...
@@ -20,4 +20,4 @@ filter_list:
filter
:
filter
:
-
function
:
!function
t5_utils.WSCPostprocess
-
function
:
!function
t5_utils.WSCPostprocess
metadata
:
metadata
:
-
version
:
0.0
version
:
0.0
lm_eval/tasks/super_glue/wsc/t5_utils.py
View file @
0d1ef037
...
@@ -8,7 +8,6 @@ def doc_to_text(x):
...
@@ -8,7 +8,6 @@ def doc_to_text(x):
def
_wsc_inputs
(
x
):
def
_wsc_inputs
(
x
):
words
=
x
[
"text"
].
split
(
" "
)
words
=
x
[
"text"
].
split
(
" "
)
# We would need some special logic to handle the case where the pronoun is the
# We would need some special logic to handle the case where the pronoun is the
...
@@ -55,7 +54,6 @@ def _wsc_inputs(x):
...
@@ -55,7 +54,6 @@ def _wsc_inputs(x):
class
WSCPostprocess
(
Filter
):
class
WSCPostprocess
(
Filter
):
def
__init__
(
self
,
**
kwargs
):
def
__init__
(
self
,
**
kwargs
):
self
.
determiners
=
{
self
.
determiners
=
{
"a"
,
"a"
,
"an"
,
"an"
,
...
@@ -86,10 +84,8 @@ class WSCPostprocess(Filter):
...
@@ -86,10 +84,8 @@ class WSCPostprocess(Filter):
return
" "
.
join
([
w
for
w
in
s
.
split
(
" "
)
if
w
not
in
self
.
determiners
])
return
" "
.
join
([
w
for
w
in
s
.
split
(
" "
)
if
w
not
in
self
.
determiners
])
def
apply
(
self
,
resps
,
docs
):
def
apply
(
self
,
resps
,
docs
):
filtered_resps
=
[]
filtered_resps
=
[]
for
prediction
,
reference
in
zip
(
*
(
resps
,
docs
[
"span1_text"
])):
for
prediction
,
reference
in
zip
(
*
(
resps
,
docs
[
"span1_text"
])):
prediction
=
self
.
clean
(
prediction
[
0
])
prediction
=
self
.
clean
(
prediction
[
0
])
reference
=
self
.
clean
(
reference
)
reference
=
self
.
clean
(
reference
)
...
...
lm_eval/tasks/swag/swag.yaml
View file @
0d1ef037
...
@@ -16,4 +16,4 @@ metric_list:
...
@@ -16,4 +16,4 @@ metric_list:
aggregation
:
mean
aggregation
:
mean
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/toxigen/toxigen.yaml
View file @
0d1ef037
...
@@ -15,4 +15,4 @@ metric_list:
...
@@ -15,4 +15,4 @@ metric_list:
aggregation
:
mean
aggregation
:
mean
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/translation/utils.py
View file @
0d1ef037
import
argparse
import
argparse
from
typing
import
Dict
,
List
import
yaml
import
yaml
import
sacrebleu
try
:
try
:
import
pycountry
import
pycountry
...
...
lm_eval/tasks/translation/wmt_common_yaml
View file @
0d1ef037
...
@@ -14,4 +14,4 @@ generation_kwargs:
...
@@ -14,4 +14,4 @@ generation_kwargs:
temperature: 0.0
temperature: 0.0
repeats: 1
repeats: 1
metadata:
metadata:
-
version:
0
.0
version:
1
.0
lm_eval/tasks/triviaqa/default.yaml
View file @
0d1ef037
...
@@ -28,4 +28,4 @@ metric_list:
...
@@ -28,4 +28,4 @@ metric_list:
ignore_case
:
true
ignore_case
:
true
ignore_punctuation
:
true
ignore_punctuation
:
true
metadata
:
metadata
:
-
version
:
2
.0
version
:
3
.0
lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
View file @
0d1ef037
...
@@ -76,4 +76,4 @@ metric_list:
...
@@ -76,4 +76,4 @@ metric_list:
aggregation
:
mean
aggregation
:
mean
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
-
version
:
2
.0
version
:
3
.0
lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
View file @
0d1ef037
...
@@ -33,4 +33,4 @@ metric_list:
...
@@ -33,4 +33,4 @@ metric_list:
aggregation
:
mean
aggregation
:
mean
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
-
version
:
2.0
version
:
2.0
lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml
View file @
0d1ef037
...
@@ -10,4 +10,4 @@ metric_list:
...
@@ -10,4 +10,4 @@ metric_list:
aggregation
:
mean
aggregation
:
mean
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
-
version
:
2.0
version
:
2.0
lm_eval/tasks/truthfulqa/utils.py
View file @
0d1ef037
...
@@ -6,7 +6,6 @@ from rouge_score import rouge_scorer, scoring
...
@@ -6,7 +6,6 @@ from rouge_score import rouge_scorer, scoring
def
process_results_mc2
(
doc
,
results
):
def
process_results_mc2
(
doc
,
results
):
lls
,
is_greedy
=
zip
(
*
results
)
lls
,
is_greedy
=
zip
(
*
results
)
# Split on the first `0` as everything before it is true (`1`).
# Split on the first `0` as everything before it is true (`1`).
...
@@ -20,7 +19,6 @@ def process_results_mc2(doc, results):
...
@@ -20,7 +19,6 @@ def process_results_mc2(doc, results):
def
process_docs_gen
(
dataset
:
datasets
.
Dataset
)
->
datasets
.
Dataset
:
def
process_docs_gen
(
dataset
:
datasets
.
Dataset
)
->
datasets
.
Dataset
:
return
dataset
.
map
(
preprocess_function
)
return
dataset
.
map
(
preprocess_function
)
...
@@ -49,7 +47,6 @@ def preprocess_function(examples):
...
@@ -49,7 +47,6 @@ def preprocess_function(examples):
def
process_results_gen
(
doc
,
results
):
def
process_results_gen
(
doc
,
results
):
completion
=
results
[
0
]
completion
=
results
[
0
]
true_refs
,
false_refs
=
doc
[
"correct_answers"
],
doc
[
"incorrect_answers"
]
true_refs
,
false_refs
=
doc
[
"correct_answers"
],
doc
[
"incorrect_answers"
]
all_refs
=
true_refs
+
false_refs
all_refs
=
true_refs
+
false_refs
...
...
lm_eval/tasks/unscramble/anagrams1.yaml
View file @
0d1ef037
...
@@ -17,4 +17,4 @@ metric_list:
...
@@ -17,4 +17,4 @@ metric_list:
ignore_case
:
false
ignore_case
:
false
ignore_punctuation
:
false
ignore_punctuation
:
false
metadata
:
metadata
:
-
version
:
1
.0
version
:
2
.0
lm_eval/tasks/unscramble/anagrams2.yaml
View file @
0d1ef037
...
@@ -17,4 +17,4 @@ metric_list:
...
@@ -17,4 +17,4 @@ metric_list:
ignore_case
:
false
ignore_case
:
false
ignore_punctuation
:
false
ignore_punctuation
:
false
metadata
:
metadata
:
-
version
:
1
.0
version
:
2
.0
Prev
1
…
15
16
17
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment