Commit e4db76cb authored by haileyschoelkopf

Merge branch 'main' into multimodal-prototyping

parents 6cc6e9cd ad80f555
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
- xwinograd
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
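Since this stanza is merged into each generated per-language config, the dataset_name override is the only language-specific piece. A rough sketch of how such a merge composes (the make_language_config helper and the xwinograd_en task name are assumptions for illustration, not harness API):

# Illustrative only: mimics how a language-specific config could override
# the shared fields above. Not the harness's actual generation code.
base_config = {
    "group": ["xwinograd"],
    "dataset_path": "Muennighoff/xwinograd",
    "dataset_name": None,  # overridden by the language-specific config
    "output_type": "multiple_choice",
}

def make_language_config(lang: str) -> dict:
    """Fill in the language-specific fields on top of the shared base."""
    config = dict(base_config)
    config["task"] = f"xwinograd_{lang}"  # hypothetical task name
    config["dataset_name"] = lang         # e.g. "en", "fr", "jp"
    return config

print(make_language_config("en"))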
@@ -308,7 +308,7 @@ class Reorderer:
         return res


-def make_table(result_dict, column: str = "results", sort_results: bool = True):
+def make_table(result_dict, column: str = "results", sort_results: bool = False):
     """Generate table of results."""
     from pytablewriter import LatexTableWriter, MarkdownTableWriter
@@ -338,20 +338,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
     keys = result_dict[column].keys()
     if sort_results:
-        # sort entries alphabetically
+        # sort entries alphabetically by task or group name.
+        # NOTE: we default here to false, because order matters for multi-level table printing a la mmlu.
+        # sorting here would mess that up
         keys = sorted(keys)

     for k in keys:
         dic = result_dict[column][k]
-        version = result_dict["versions"].get(k, "N/A")
-        n = str(result_dict["n-shot"][k])
+        version = result_dict["versions"].get(k, " N/A")
+        n = str(result_dict.get("n-shot", " ").get(k, " "))
         higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})

         if "alias" in dic:
             k = dic.pop("alias")

         metric_items = dic.items()
-        if sort_results:
-            metric_items = sorted(metric_items)
+        metric_items = sorted(metric_items)

         for (mf), v in metric_items:
             m, _, f = mf.partition(",")
@@ -362,8 +363,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
         if m + "_stderr" + "," + f in dic:
             se = dic[m + "_stderr" + "," + f]
-            if se != "N/A":
-                se = "%.4f" % se
+            se = " N/A" if se == "N/A" else "%.4f" % se
             values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
         else:
             values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""])
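To see what the new default changes in practice, here is a minimal, hedged usage sketch (it assumes make_table is importable from lm_eval.utils and that pytablewriter is installed; the toy result_dict fills only the keys the function reads):

from lm_eval.utils import make_table  # assumed import path

# Toy result_dict with only the keys make_table reads above.
result_dict = {
    "results": {
        "mmlu_stem": {"acc,none": 0.2474, "acc_stderr,none": "N/A", "alias": "stem"},
    },
    "versions": {"mmlu_stem": 1},
    "n-shot": {"mmlu_stem": 0},
    "higher_is_better": {"mmlu_stem": {"acc": True}},
}

# sort_results now defaults to False, so the evaluator's task ordering
# (group rows followed by their subtasks, a la mmlu) is kept intact.
print(make_table(result_dict))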
@@ -76,7 +76,6 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
-unitxt = ["unitxt"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
@@ -95,7 +94,6 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
-    "lm_eval[unitxt]"
 ]

 [tool.ruff.lint]
@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
 BibTeX-formatted citation goes here
 ```

-### Groups and Tasks
+### Groups, Tags, and Tasks

 #### Groups

 * `group_name`: `Short description`

+#### Tags
+
+* `tag_name`: `Short description`
+
 #### Tasks

 * `task_name`: `1-sentence description of what this particular task does`
@@ -90,7 +90,7 @@ def test_evaluator(
             "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
         ),
         (
-            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
+            ["mmlu_stem"],
             10,
             "hf",
             "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
-| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
-|----------------|-------|------|-----:|--------|---|----:|---|------|
-|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
-| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
-| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
-| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
-| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
-| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
+| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
+|-------------|------:|------|-----:|--------|---|----:|---|------|
+|arc_challenge| 1|none | 0|acc |↑ | 0.0|± | N/A|
+| | |none | 0|acc_norm|↑ | 0.0|± | N/A|
+|arc_easy | 1|none | 0|acc |↑ | 0.3|± | N/A|
+| | |none | 0|acc_norm|↑ | 0.1|± | N/A|
\ No newline at end of file
 | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
 |--------------|------:|------|-----:|----------|---|-------:|---|------|
-|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
-| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
+|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± | N/A|
+| | |none | 0|perplexity|↓ |605.3866|± | N/A|
\ No newline at end of file
-| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
-|----------------|------:|------|-----:|------|---|----:|---|------|
-|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
-|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
-|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
+| Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|-------------------------------|------:|------|-----:|------|---|-----:|---|------|
+|stem | 1|none | |acc |↑ |0.2474|± | N/A|
+| - abstract_algebra | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - anatomy | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - astronomy | 0|none | 0|acc |↑ |0.1000|± | N/A|
+| - college_biology | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - college_chemistry | 0|none | 0|acc |↑ |0.1000|± | N/A|
+| - college_computer_science | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - college_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - college_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - computer_security | 0|none | 0|acc |↑ |0.5000|± | N/A|
+| - conceptual_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - electrical_engineering | 0|none | 0|acc |↑ |0.4000|± | N/A|
+| - elementary_mathematics | 0|none | 0|acc |↑ |0.0000|± | N/A|
+| - high_school_biology | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - high_school_chemistry | 0|none | 0|acc |↑ |0.4000|± | N/A|
+| - high_school_computer_science| 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - high_school_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - high_school_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - high_school_statistics | 0|none | 0|acc |↑ |0.0000|± | N/A|
+| - machine_learning | 0|none | 0|acc |↑ |0.3000|± | N/A|
\ No newline at end of file
 | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
 |--------|------:|------|-----:|---------------|---|-------:|---|------|
-|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
-| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
-| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file
+|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± | N/A|
+| | |none | 0|byte_perplexity|↓ | 2.5304|± | N/A|
+| | |none | 0|word_perplexity|↓ |130.4801|± | N/A|
\ No newline at end of file
group: test-1
group_alias: test 1
task:
  - piqa # string task
  - ai2_arc # string tag
  - task: super-glue-lm-eval-v1 # Should this be spread out?
    num_fewshot: 3
  - task: swag # dict registered task
    num_fewshot: 2
  - task: mmlu
    num_fewshot: 5
  - group: nli-tasks # dict group
    task:
      - anli
      - boolq
      - sglue_rte
    num_fewshot: 4
    metric_list:
      - metric: brier_score
  - task: sciq # dict registered task duplicate
    task_alias: sciq 2-shot
    num_fewshot: 2
  - task: sciq # dict registered task duplicate
    task_alias: sciq 4-shot
    num_fewshot: 4
  - task: sciq # dict registered task duplicate
    task_alias: sciq 6-shot
    num_fewshot: 6
  - task: siqa_custom # dict task
    dataset_path: social_i_qa
    dataset_name: null
    output_type: multiple_choice
    training_split: train
    validation_split: validation
    doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
    target_delimiter: " "
    doc_to_choice:
      - "{{answerA}}"
      - "{{answerB}}"
      - "{{answerC}}"
    doc_to_target: "{{ (label|int) - 1 }}"
    metric_list:
      - metric: acc
        aggregation: mean
        higher_is_better: true
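The task list above deliberately mixes bare strings, dict entries with overrides, a nested group, and aliased duplicates. A minimal sketch of flattening such entries into uniform (name, overrides) pairs, illustrative only and not the harness's actual loader:

from __future__ import annotations

from typing import Any

def normalize_entry(entry: str | dict) -> tuple[str, dict[str, Any]]:
    """Reduce one task-list entry to a (name, overrides) pair."""
    if isinstance(entry, str):  # bare task or tag name, e.g. "piqa"
        return entry, {}
    if "group" in entry:  # nested group carrying its own task list
        return entry["group"], {k: v for k, v in entry.items() if k != "group"}
    return entry["task"], {k: v for k, v in entry.items() if k != "task"}

tasks = [
    "piqa",
    {"task": "sciq", "task_alias": "sciq 2-shot", "num_fewshot": 2},
    {"group": "nli-tasks", "task": ["anli", "boolq", "sglue_rte"], "num_fewshot": 4},
]
for name, overrides in map(normalize_entry, tasks):
    print(name, overrides)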