Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
3d1b8f43
Unverified
Commit
3d1b8f43
authored
Jul 03, 2024
by
Lintang Sutawika
Committed by
GitHub
Jul 03, 2024
Browse files
Merge branch 'main' into group-agg-rework
parents
e200c24e
d855d0ba
Changes
317
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
81 additions
and
15 deletions
+81
-15
lm_eval/tasks/paloma/paloma_dolma-v1_5.yaml
lm_eval/tasks/paloma/paloma_dolma-v1_5.yaml
+4
-0
lm_eval/tasks/paloma/paloma_dolma_100_programing_languages.yaml
...l/tasks/paloma/paloma_dolma_100_programing_languages.yaml
+4
-0
lm_eval/tasks/paloma/paloma_dolma_100_subreddits.yaml
lm_eval/tasks/paloma/paloma_dolma_100_subreddits.yaml
+4
-0
lm_eval/tasks/paloma/paloma_falcon-refinedweb.yaml
lm_eval/tasks/paloma/paloma_falcon-refinedweb.yaml
+4
-0
lm_eval/tasks/paloma/paloma_gab.yaml
lm_eval/tasks/paloma/paloma_gab.yaml
+4
-0
lm_eval/tasks/paloma/paloma_m2d2_s2orc_unsplit.yaml
lm_eval/tasks/paloma/paloma_m2d2_s2orc_unsplit.yaml
+4
-0
lm_eval/tasks/paloma/paloma_m2d2_wikipedia_unsplit.yaml
lm_eval/tasks/paloma/paloma_m2d2_wikipedia_unsplit.yaml
+4
-0
lm_eval/tasks/paloma/paloma_manosphere_meta_sep.yaml
lm_eval/tasks/paloma/paloma_manosphere_meta_sep.yaml
+4
-0
lm_eval/tasks/paloma/paloma_mc4.yaml
lm_eval/tasks/paloma/paloma_mc4.yaml
+4
-0
lm_eval/tasks/paloma/paloma_ptb.yaml
lm_eval/tasks/paloma/paloma_ptb.yaml
+4
-0
lm_eval/tasks/paloma/paloma_redpajama.yaml
lm_eval/tasks/paloma/paloma_redpajama.yaml
+4
-0
lm_eval/tasks/paloma/paloma_twitterAAE_HELM_fixed.yaml
lm_eval/tasks/paloma/paloma_twitterAAE_HELM_fixed.yaml
+4
-0
lm_eval/tasks/paloma/paloma_utils.py
lm_eval/tasks/paloma/paloma_utils.py
+2
-0
lm_eval/tasks/paloma/paloma_wikitext_103.yaml
lm_eval/tasks/paloma/paloma_wikitext_103.yaml
+4
-0
lm_eval/tasks/piqa/piqa.yaml
lm_eval/tasks/piqa/piqa.yaml
+2
-0
lm_eval/tasks/scrolls/task.py
lm_eval/tasks/scrolls/task.py
+22
-8
lm_eval/tasks/siqa/siqa.yaml
lm_eval/tasks/siqa/siqa.yaml
+1
-4
lm_eval/tasks/squad_completion/task.py
lm_eval/tasks/squad_completion/task.py
+0
-2
lm_eval/tasks/squadv2/task.py
lm_eval/tasks/squadv2/task.py
+1
-0
lm_eval/tasks/tinyBenchmarks/utils_winogrande.py
lm_eval/tasks/tinyBenchmarks/utils_winogrande.py
+1
-1
No files found.
lm_eval/tasks/paloma/paloma_dolma-v1_5.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_dolma-v1_5
task_alias
:
Dolma V1.5
dataset_name
:
dolma-v1_5
lm_eval/tasks/paloma/paloma_dolma_100_programing_languages.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_dolma_100_programing_languages
task_alias
:
100 PLs
dataset_name
:
dolma_100_programing_languages
lm_eval/tasks/paloma/paloma_dolma_100_subreddits.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_dolma_100_subreddits
task_alias
:
100 Subreddits
dataset_name
:
dolma_100_subreddits
lm_eval/tasks/paloma/paloma_falcon-refinedweb.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_falcon-refinedweb
task_alias
:
Falcon
dataset_name
:
falcon-refinedweb
lm_eval/tasks/paloma/paloma_gab.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_gab
task_alias
:
Gab
dataset_name
:
gab
lm_eval/tasks/paloma/paloma_m2d2_s2orc_unsplit.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_m2d2_s2orc_unsplit
task_alias
:
M2D2 S2ORC
dataset_name
:
m2d2_s2orc_unsplit
lm_eval/tasks/paloma/paloma_m2d2_wikipedia_unsplit.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_m2d2_wikipedia_unsplit
task_alias
:
M2D2 Wikipedia
dataset_name
:
m2d2_wikipedia_unsplit
lm_eval/tasks/paloma/paloma_manosphere_meta_sep.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_manosphere_meta_sep
task_alias
:
Manosphere
dataset_name
:
manosphere_meta_sep
lm_eval/tasks/paloma/paloma_mc4.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_mc4
task_alias
:
mC4
dataset_name
:
mc4
lm_eval/tasks/paloma/paloma_ptb.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_ptb
task_alias
:
PTB
dataset_name
:
ptb
lm_eval/tasks/paloma/paloma_redpajama.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_redpajama
task_alias
:
RedPajama
dataset_name
:
redpajama
lm_eval/tasks/paloma/paloma_twitterAAE_HELM_fixed.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_twitterAAE_HELM_fixed
task_alias
:
Twitter AAE
dataset_name
:
twitterAAE_HELM_fixed
lm_eval/tasks/paloma/paloma_utils.py
0 → 100644
View file @
3d1b8f43
def
doc_to_target
(
doc
):
return
str
(
doc
[
"text"
])
lm_eval/tasks/paloma/paloma_wikitext_103.yaml
0 → 100644
View file @
3d1b8f43
include
:
_paloma_template
task
:
paloma_wikitext_103
task_alias
:
Wikitext-103
dataset_name
:
wikitext_103
lm_eval/tasks/piqa/piqa.yaml
View file @
3d1b8f43
...
@@ -19,3 +19,5 @@ metric_list:
...
@@ -19,3 +19,5 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/scrolls/task.py
View file @
3d1b8f43
...
@@ -4,12 +4,12 @@ from functools import reduce
...
@@ -4,12 +4,12 @@ from functools import reduce
import
numpy
as
np
import
numpy
as
np
import
transformers.data.metrics.squad_metrics
as
squad_metrics
import
transformers.data.metrics.squad_metrics
as
squad_metrics
from
datasets
import
load_metric
from
datasets
import
Dataset
,
load_metric
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
lm_eval.api.instance
import
Instance
from
lm_eval.api.instance
import
Instance
from
lm_eval.api.metrics
import
mean
from
lm_eval.api.metrics
import
mean
from
lm_eval.api.task
import
Task
from
lm_eval.api.task
import
Configurable
Task
_CITATION
=
"""
_CITATION
=
"""
...
@@ -108,7 +108,7 @@ def _num_cpu_cores():
...
@@ -108,7 +108,7 @@ def _num_cpu_cores():
return
len
(
os
.
sched_getaffinity
(
0
))
return
len
(
os
.
sched_getaffinity
(
0
))
class
_SCROLLSTask
(
Task
):
class
_SCROLLSTask
(
Configurable
Task
):
VERSION
=
2
VERSION
=
2
DATASET_PATH
=
"tau/scrolls"
DATASET_PATH
=
"tau/scrolls"
DATASET_NAME
=
None
DATASET_NAME
=
None
...
@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
...
@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
PRUNE_NUM_PROC
=
None
PRUNE_NUM_PROC
=
None
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
(
config
=
{
"metadata"
:
{
"version"
:
self
.
VERSION
}}
)
if
self
.
DATASET_NAME
is
not
None
:
if
self
.
DATASET_NAME
is
not
None
:
self
.
metric
=
load_metric
(
_download_metric
(),
config_name
=
self
.
DATASET_NAME
)
self
.
metric
=
load_metric
(
_download_metric
(),
config_name
=
self
.
DATASET_NAME
)
...
@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
...
@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
return
False
return
False
def
training_docs
(
self
):
def
training_docs
(
self
):
for
doc
in
self
.
dataset
[
"train"
]:
processed_docs
=
list
(
map
(
self
.
_process_doc
,
self
.
dataset
[
"train"
]))
yield
from
self
.
_process_doc
(
doc
)
# Flatten the list of lists since _process_doc returns a list of one element.
processed_docs
=
[
item
for
sublist
in
processed_docs
for
item
in
sublist
]
processed_dict
=
{
key
:
[
d
[
key
]
for
d
in
processed_docs
]
for
key
in
processed_docs
[
0
]
}
return
Dataset
.
from_dict
(
processed_dict
)
def
validation_docs
(
self
):
def
validation_docs
(
self
):
for
doc
in
self
.
dataset
[
"validation"
]:
processed_docs
=
list
(
map
(
self
.
_process_doc
,
self
.
dataset
[
"validation"
]))
yield
from
self
.
_process_doc
(
doc
)
# Flatten the list of lists since _process_doc returns a list of one element.
processed_docs
=
[
item
for
sublist
in
processed_docs
for
item
in
sublist
]
processed_dict
=
{
key
:
[
d
[
key
]
for
d
in
processed_docs
]
for
key
in
processed_docs
[
0
]
}
return
Dataset
.
from_dict
(
processed_dict
)
def
should_decontaminate
(
self
):
def
should_decontaminate
(
self
):
return
True
return
True
...
...
lm_eval/tasks/siqa/siqa.yaml
View file @
3d1b8f43
...
@@ -6,10 +6,7 @@ training_split: train
...
@@ -6,10 +6,7 @@ training_split: train
validation_split
:
validation
validation_split
:
validation
doc_to_text
:
"
Q:
{{context}}
{{question}}
\n
A:"
doc_to_text
:
"
Q:
{{context}}
{{question}}
\n
A:"
target_delimiter
:
"
"
target_delimiter
:
"
"
doc_to_choice
:
doc_to_choice
:
"
{{[answerA,
answerB,
answerC]}}"
-
"
{{answerA}}"
-
"
{{answerB}}"
-
"
{{answerC}}"
doc_to_target
:
"
{{
(label|int)
-
1
}}"
doc_to_target
:
"
{{
(label|int)
-
1
}}"
metric_list
:
metric_list
:
-
metric
:
acc
-
metric
:
acc
...
...
lm_eval/tasks/squad_completion/task.py
View file @
3d1b8f43
"""
"""
import
re
import
re
from
typing
import
List
from
typing
import
List
...
...
lm_eval/tasks/squadv2/task.py
View file @
3d1b8f43
...
@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from ans
...
@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from ans
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
"""
from
functools
import
partial
from
functools
import
partial
from
math
import
exp
from
math
import
exp
...
...
lm_eval/tasks/tinyBenchmarks/utils_winogrande.py
View file @
3d1b8f43
"""
This code mirrors the utils of the original winogrande task
"""
"""This code mirrors the utils of the original winogrande task"""
def
doc_to_text
(
doc
):
def
doc_to_text
(
doc
):
...
...
Prev
1
…
11
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment