Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
e4db76cb
Commit
e4db76cb
authored
Jul 09, 2024
by
haileyschoelkopf
Browse files
Merge branch 'main' into multimodal-prototyping
parents
6cc6e9cd
ad80f555
Changes
871
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
92 additions
and
32 deletions
+92
-32
lm_eval/tasks/xwinograd/xwinograd_common_yaml
lm_eval/tasks/xwinograd/xwinograd_common_yaml
+0
-2
lm_eval/utils.py
lm_eval/utils.py
+8
-8
pyproject.toml
pyproject.toml
+0
-2
templates/new_yaml_task/README.md
templates/new_yaml_task/README.md
+5
-1
tests/test_evaluator.py
tests/test_evaluator.py
+1
-1
tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
...rained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+6
-8
tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
...rained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+2
-2
tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
...rained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+0
-5
tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
...rained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+22
-0
tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
...rained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+3
-3
tests/testyamls/test-01.yaml
tests/testyamls/test-01.yaml
+45
-0
No files found.
lm_eval/tasks/xwinograd/xwinograd_common_yaml
View file @
e4db76cb
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
- xwinograd
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
...
...
lm_eval/utils.py
View file @
e4db76cb
...
...
@@ -308,7 +308,7 @@ class Reorderer:
return
res
def
make_table
(
result_dict
,
column
:
str
=
"results"
,
sort_results
:
bool
=
Tru
e
):
def
make_table
(
result_dict
,
column
:
str
=
"results"
,
sort_results
:
bool
=
Fals
e
):
"""Generate table of results."""
from
pytablewriter
import
LatexTableWriter
,
MarkdownTableWriter
...
...
@@ -338,20 +338,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
keys
=
result_dict
[
column
].
keys
()
if
sort_results
:
# sort entries alphabetically
# sort entries alphabetically by task or group name.
# NOTE: we default here to false, because order matters for multi-level table printing a la mmlu.
# sorting here would mess that up
keys
=
sorted
(
keys
)
for
k
in
keys
:
dic
=
result_dict
[
column
][
k
]
version
=
result_dict
[
"versions"
].
get
(
k
,
"N/A"
)
n
=
str
(
result_dict
[
"n-shot"
][
k
]
)
version
=
result_dict
[
"versions"
].
get
(
k
,
"
N/A"
)
n
=
str
(
result_dict
.
get
(
"n-shot"
,
" "
).
get
(
k
,
" "
)
)
higher_is_better
=
result_dict
.
get
(
"higher_is_better"
,
{}).
get
(
k
,
{})
if
"alias"
in
dic
:
k
=
dic
.
pop
(
"alias"
)
metric_items
=
dic
.
items
()
if
sort_results
:
metric_items
=
sorted
(
metric_items
)
metric_items
=
sorted
(
metric_items
)
for
(
mf
),
v
in
metric_items
:
m
,
_
,
f
=
mf
.
partition
(
","
)
...
...
@@ -362,8 +363,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
if
m
+
"_stderr"
+
","
+
f
in
dic
:
se
=
dic
[
m
+
"_stderr"
+
","
+
f
]
if
se
!=
"N/A"
:
se
=
"%.4f"
%
se
se
=
" N/A"
if
se
==
"N/A"
else
"%.4f"
%
se
values
.
append
([
k
,
version
,
f
,
n
,
m
,
hib
,
"%.4f"
%
v
,
"±"
,
se
])
else
:
values
.
append
([
k
,
version
,
f
,
n
,
m
,
hib
,
"%.4f"
%
v
,
""
,
""
])
...
...
pyproject.toml
View file @
e4db76cb
...
...
@@ -76,7 +76,6 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm
=
["vllm>=0.4.2"]
zeno
=
[
"pandas"
,
"zeno-client"
]
wandb
=
[
"wandb>=0.16.3"
,
"pandas"
,
"numpy"
]
unitxt
=
["unitxt"]
all
=
[
"lm_eval[anthropic]"
,
"lm_eval[dev]"
,
...
...
@@ -95,7 +94,6 @@ all = [
"lm_eval[vllm]"
,
"lm_eval[zeno]"
,
"lm_eval[wandb]"
,
"lm_eval[unitxt]"
]
[tool.ruff.lint]
...
...
templates/new_yaml_task/README.md
View file @
e4db76cb
...
...
@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here
```
### Groups and Tasks
### Groups
, Tags,
and Tasks
#### Groups
*
`group_name`
:
`Short description`
#### Tags
*
`tag_name`
:
`Short description`
#### Tasks
*
`task_name`
:
`1-sentence description of what this particular task does`
...
...
tests/test_evaluator.py
View file @
e4db76cb
...
...
@@ -90,7 +90,7 @@ def test_evaluator(
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu"
,
),
(
[
"mmlu_
abstract_algebra"
,
"mmlu_global_facts"
,
"mmlu_public_relations
"
],
[
"mmlu_
stem
"
],
10
,
"hf"
,
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu"
,
...
...
tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
View file @
e4db76cb
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|-------------|------:|------|-----:|--------|---|----:|---|------|
|arc_challenge| 1|none | 0|acc |↑ | 0.0|± | N/A|
| | |none | 0|acc_norm|↑ | 0.0|± | N/A|
|arc_easy | 1|none | 0|acc |↑ | 0.3|± | N/A|
| | |none | 0|acc_norm|↑ | 0.1|± | N/A|
\ No newline at end of file
tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
View file @
e4db76cb
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± | N/A|
| | |none | 0|perplexity|↓ |605.3866|± | N/A|
\ No newline at end of file
tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
deleted
100644 → 0
View file @
6cc6e9cd
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
0 → 100644
View file @
e4db76cb
| Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
|-------------------------------|------:|------|-----:|------|---|-----:|---|------|
|stem | 1|none | |acc |↑ |0.2474|± | N/A|
| - abstract_algebra | 0|none | 0|acc |↑ |0.2000|± | N/A|
| - anatomy | 0|none | 0|acc |↑ |0.3000|± | N/A|
| - astronomy | 0|none | 0|acc |↑ |0.1000|± | N/A|
| - college_biology | 0|none | 0|acc |↑ |0.3000|± | N/A|
| - college_chemistry | 0|none | 0|acc |↑ |0.1000|± | N/A|
| - college_computer_science | 0|none | 0|acc |↑ |0.2000|± | N/A|
| - college_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A|
| - college_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
| - computer_security | 0|none | 0|acc |↑ |0.5000|± | N/A|
| - conceptual_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
| - electrical_engineering | 0|none | 0|acc |↑ |0.4000|± | N/A|
| - elementary_mathematics | 0|none | 0|acc |↑ |0.0000|± | N/A|
| - high_school_biology | 0|none | 0|acc |↑ |0.3000|± | N/A|
| - high_school_chemistry | 0|none | 0|acc |↑ |0.4000|± | N/A|
| - high_school_computer_science| 0|none | 0|acc |↑ |0.3000|± | N/A|
| - high_school_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A|
| - high_school_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
| - high_school_statistics | 0|none | 0|acc |↑ |0.0000|± | N/A|
| - machine_learning | 0|none | 0|acc |↑ |0.3000|± | N/A|
\ No newline at end of file
tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
View file @
e4db76cb
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± | N/A|
| | |none | 0|byte_perplexity|↓ | 2.5304|± | N/A|
| | |none | 0|word_perplexity|↓ |130.4801|± | N/A|
\ No newline at end of file
tests/testyamls/test-01.yaml
0 → 100644
View file @
e4db76cb
group
:
test-1
group_alias
:
test
1
task
:
-
piqa
# string task
-
ai2_arc
# string tag
-
task
:
super-glue-lm-eval-v1
# Should this be spread out?
num_fewshot
:
3
-
task
:
swag
# dict registered task
num_fewshot
:
2
-
task
:
mmlu
num_fewshot
:
5
-
group
:
nli-tasks
# dict group
task
:
-
anli
-
boolq
-
sglue_rte
num_fewshot
:
4
metric_list
:
-
metric
:
brier_score
-
task
:
sciq
# dict registered task duplicate
task_alias
:
sciq 2-shot
num_fewshot
:
2
-
task
:
sciq
# dict registered task duplicate
task_alias
:
sciq 4-shot
num_fewshot
:
4
-
task
:
sciq
# dict registered task duplicate
task_alias
:
sciq 6-shot
num_fewshot
:
6
-
task
:
siqa_custom
# dict task
dataset_path
:
social_i_qa
dataset_name
:
null
output_type
:
multiple_choice
training_split
:
train
validation_split
:
validation
doc_to_text
:
"
Question:
{{context}}
{{question}}
\n
Answer:"
target_delimiter
:
"
"
doc_to_choice
:
-
"
{{answerA}}"
-
"
{{answerB}}"
-
"
{{answerC}}"
doc_to_target
:
"
{{
(label|int)
-
1
}}"
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
Prev
1
…
40
41
42
43
44
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment