Commit 7d6ec4d9 authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	lm_eval/__init__.py
#	pyproject.toml
parents 1020c46e d021bf84
"dataset_name": "professional_psychology" "dataset_name": "professional_psychology"
"description": "The following are questions (with answers) about professional\ "description": "The following are questions (with answers) about professional\
\ psychology.\n\n" \ psychology.\n\n"
"tag": "mmlu_continuation_social_sciences" "tag": "mmlu_social_sciences_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_professional_psychology" "task": "mmlu_professional_psychology_continuation"
"dataset_name": "public_relations" "dataset_name": "public_relations"
"description": "The following are questions (with answers) about public\ "description": "The following are questions (with answers) about public\
\ relations.\n\n" \ relations.\n\n"
"tag": "mmlu_continuation_social_sciences" "tag": "mmlu_social_sciences_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_public_relations" "task": "mmlu_public_relations_continuation"
"dataset_name": "security_studies" "dataset_name": "security_studies"
"description": "The following are questions (with answers) about security\ "description": "The following are questions (with answers) about security\
\ studies.\n\n" \ studies.\n\n"
"tag": "mmlu_continuation_social_sciences" "tag": "mmlu_social_sciences_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_security_studies" "task": "mmlu_security_studies_continuation"
"dataset_name": "sociology" "dataset_name": "sociology"
"description": "The following are questions (with answers) about sociology.\n\ "description": "The following are questions (with answers) about sociology.\n\
\n" \n"
"tag": "mmlu_continuation_social_sciences" "tag": "mmlu_social_sciences_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_sociology" "task": "mmlu_sociology_continuation"
"dataset_name": "us_foreign_policy" "dataset_name": "us_foreign_policy"
"description": "The following are questions (with answers) about us\ "description": "The following are questions (with answers) about us\
\ foreign policy.\n\n" \ foreign policy.\n\n"
"tag": "mmlu_continuation_social_sciences" "tag": "mmlu_social_sciences_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_us_foreign_policy" "task": "mmlu_us_foreign_policy_continuation"
"dataset_name": "virology" "dataset_name": "virology"
"description": "The following are questions (with answers) about virology.\n\ "description": "The following are questions (with answers) about virology.\n\
\n" \n"
"tag": "mmlu_continuation_other" "tag": "mmlu_other_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_virology" "task": "mmlu_virology_continuation"
"dataset_name": "world_religions" "dataset_name": "world_religions"
"description": "The following are questions (with answers) about world\ "description": "The following are questions (with answers) about world\
\ religions.\n\n" \ religions.\n\n"
"tag": "mmlu_continuation_humanities" "tag": "mmlu_humanities_continuation"
"include": "_continuation_template_yaml" "include": "_continuation_template_yaml"
"task": "mmlu_continuation_world_religions" "task": "mmlu_world_religions_continuation"
-dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+dataset_path: cais/mmlu
 validation_split: validation
 test_split: test
 fewshot_config:
......
-dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
 output_type: generate_until
......
-dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
 fewshot_config:
......
-dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
 fewshot_config:
......
-dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
 fewshot_config:
......
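Note: the hunks above switch dataset_path from the hails/mmlu_no_train mirror back to upstream cais/mmlu. A hedged sketch (assuming the Hugging Face datasets package; "sociology" is one of the subject configs referenced above) for inspecting which splits a per-subject config actually ships, since the mirror existed only to drop auxiliary_train:

from datasets import load_dataset

# per-subject config of the upstream dataset; print the DatasetDict to check
# that the validation/test/dev splits referenced in the YAMLs are present
ds = load_dataset("cais/mmlu", "sociology")
print(ds)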
pyproject.toml
@@ -3,6 +3,8 @@ build-backend = "setuptools.build_meta"
 requires = ["setuptools>=40.8.0", "wheel"]

 [project]
+name = "lm_eval"
+version = "0.4.9.1"
 authors = [
     {email = "contact@eleuther.ai", name = "EleutherAI"}
 ]
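Note: this hunk adds the package name and version to pyproject.toml, one of the two merge-conflicted files alongside lm_eval/__init__.py. A hedged sketch (assumes the package was reinstalled after this commit) checking the installed metadata with only the standard library:

from importlib.metadata import version

# the version string comes from the [project] table added above
assert version("lm_eval") == "0.4.9.1"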
......
 import os

-import pytest
-
-import lm_eval.api as api
-import lm_eval.evaluator as evaluator
 from lm_eval import tasks


-@pytest.mark.parametrize(
-    "limit,model,model_args",
-    [
-        (
-            10,
-            "hf",
-            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
-        ),
-    ],
-)
-def test_include_correctness(limit: int, model: str, model_args: str):
-    task_name = ["arc_easy"]
-
-    task_manager = tasks.TaskManager()
-    task_dict = tasks.get_task_dict(task_name, task_manager)
-
-    e1 = evaluator.simple_evaluate(
-        model=model,
-        tasks=task_name,
-        limit=limit,
-        model_args=model_args,
-    )
-    assert e1 is not None
-
-    # run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
-    lm = api.registry.get_model(model).create_from_arg_string(
-        model_args,
-        {
-            "batch_size": None,
-            "max_batch_size": None,
-            "device": None,
-        },
-    )
-    task_manager = tasks.TaskManager(
-        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
-        include_defaults=False,
-    )
-    task_dict = tasks.get_task_dict(task_name, task_manager)
-    e2 = evaluator.evaluate(
-        lm=lm,
-        task_dict=task_dict,
-        limit=limit,
-    )
-    assert e2 is not None
-    # check that caching is working
-
-    def r(x):
-        return x["results"]["arc_easy"]
-
-    assert all(
-        x == y
-        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
-    )
-
-
-# test that setting include_defaults = False works as expected and that include_path works
-def test_no_include_defaults():
-    task_name = ["arc_easy"]
-    task_manager = tasks.TaskManager(
-        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
-        include_defaults=False,
-    )
-    # should succeed, because we've included an 'arc_easy' task from this dir
-    task_dict = tasks.get_task_dict(task_name, task_manager)
-    # should fail, since ./testconfigs has no arc_challenge task
-    task_name = ["arc_challenge"]
-    with pytest.raises(KeyError):
-        task_dict = tasks.get_task_dict(task_name, task_manager)  # noqa: F841
-
-
-# test that include_path containing a task shadowing another task's name fails
-# def test_shadowed_name_fails():
-#     task_name = ["arc_easy"]
-#     task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
-#     task_dict = tasks.get_task_dict(task_name, task_manager)
+def test_include_path_precedence():
+    """Test that user-specified include paths take precedence over default paths when tasks have the same name."""
+    import tempfile
+
+    # Create a temporary directory for our custom task
+    with tempfile.TemporaryDirectory() as custom_dir:
+        # Create a custom arc_easy.yaml that has a different metric
+        custom_task_content = """task: arc_easy
+dataset_path: allenai/ai2_arc
+dataset_name: ARC-Easy
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Custom Question: {{question}}\\nAnswer:"
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+metric_list:
+  - metric: f1
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 2.0
+  custom: true
+"""
+        # Write the custom task file
+        custom_task_path = os.path.join(custom_dir, "arc_easy.yaml")
+        with open(custom_task_path, "w") as f:
+            f.write(custom_task_content)
+
+        # Test 1: User path should override default when include_defaults=True
+        task_manager = tasks.TaskManager(include_defaults=True, include_path=custom_dir)
+
+        # Load the task
+        task_dict = task_manager.load_task_or_group(["arc_easy"])
+        arc_easy_task = task_dict["arc_easy"]
+
+        # Check that the custom version was loaded (has f1 metric and custom doc_to_text)
+        assert any(
+            metric["metric"] == "f1" for metric in arc_easy_task.config["metric_list"]
+        ), "Custom task should have f1 metric"
+        assert "Custom Question:" in arc_easy_task.config["doc_to_text"], (
+            "Custom task should have custom doc_to_text"
+        )
+        assert arc_easy_task.config["metadata"]["version"] == 2.0, (
+            "Custom task should have version 2.0"
+        )
+
+        # Test 2: Verify default is used when no custom path is provided
+        default_task_manager = tasks.TaskManager(include_defaults=True)
+        default_task_dict = default_task_manager.load_task_or_group(["arc_easy"])
+        default_arc_easy = default_task_dict["arc_easy"]
+
+        # Default should not have f1 metric or custom text
+        assert not any(
+            metric["metric"] == "f1"
+            for metric in default_arc_easy.config.get("metric_list", [])
+        ), "Default task should not have f1 metric"
+        assert "Custom Question:" not in default_arc_easy.config["doc_to_text"], (
+            "Default task should not have custom doc_to_text"
+        )
+
+
+def test_include_defaults_false_with_custom_path():
+    """Test that when include_defaults=False, only custom tasks are available."""
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as custom_dir:
+        # Create a custom task using a real dataset
+        custom_task_content = """task: custom_arc_task
+dataset_path: allenai/ai2_arc
+dataset_name: ARC-Challenge
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Q: {{question}}\\nA:"
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
+  custom: true
+"""
+        # Write the custom task file
+        custom_task_path = os.path.join(custom_dir, "custom_arc_task.yaml")
+        with open(custom_task_path, "w") as f:
+            f.write(custom_task_content)
+
+        # Initialize with include_defaults=False
+        task_manager = tasks.TaskManager(
+            include_defaults=False, include_path=custom_dir
+        )
+
+        # Custom task should be available
+        assert "custom_arc_task" in task_manager.all_tasks, (
+            "Custom task should be available when include_defaults=False"
+        )
+
+        # Default tasks should NOT be available
+        assert "arc_easy" not in task_manager.all_tasks, (
+            "Default arc_easy should not be available when include_defaults=False"
+        )
+        assert "arc_challenge" not in task_manager.all_tasks, (
+            "Default arc_challenge should not be available when include_defaults=False"
+        )
+
+        # Check that only our custom task is present
+        assert len(task_manager.all_tasks) == 1, (
+            f"Should only have 1 task, but found {len(task_manager.all_tasks)}"
+        )
+
+        # Check task metadata is correctly loaded
+        task_info = task_manager.task_index["custom_arc_task"]
+        assert task_info["type"] == "task"
+        assert custom_dir in task_info["yaml_path"]
+
+
+def test_include_defaults_true_with_new_tasks():
+    """Test that new tasks from include_path are added alongside default tasks."""
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as custom_dir:
+        # Create a completely new task (not overriding any default)
+        new_task_content = """task: arc_custom_generation
+dataset_path: allenai/ai2_arc
+dataset_name: ARC-Easy
+output_type: generate_until
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Question: {{question}}\\nGenerate answer:"
+doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+generation_kwargs:
+  max_gen_toks: 50
+  temperature: 0.1
+  until:
+    - "\\n"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
+  custom_benchmark: true
+"""
+        # Write the new task file
+        new_task_path = os.path.join(custom_dir, "arc_custom_generation.yaml")
+        with open(new_task_path, "w") as f:
+            f.write(new_task_content)
+
+        # Initialize with include_defaults=True (default behavior)
+        task_manager = tasks.TaskManager(include_defaults=True, include_path=custom_dir)
+
+        # Both custom and default tasks should be available
+        assert "arc_custom_generation" in task_manager.all_tasks, (
+            "New custom task should be available"
+        )
+        assert "arc_easy" in task_manager.all_tasks, (
+            "Default arc_easy should still be available"
+        )
+        assert "arc_challenge" in task_manager.all_tasks, (
+            "Default arc_challenge should still be available"
+        )
+
+        # Check task metadata
+        custom_task_info = task_manager.task_index["arc_custom_generation"]
+        assert custom_task_info["type"] == "task"
+        assert custom_dir in custom_task_info["yaml_path"]
+
+        # Verify the counts - should have more tasks than just defaults
+        default_only_manager = tasks.TaskManager(include_defaults=True)
+        assert len(task_manager.all_tasks) > len(default_only_manager.all_tasks), (
+            "Should have more tasks when including custom path"
+        )
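Note: taken together, the new tests pin down the include_path/include_defaults semantics. A usage sketch under those semantics ("/path/to/my_tasks" is a hypothetical directory of task YAMLs):

from lm_eval import tasks

# defaults plus user tasks; a user-supplied arc_easy.yaml shadows the default
tm = tasks.TaskManager(include_defaults=True, include_path="/path/to/my_tasks")

# user tasks only; built-in names such as arc_easy are not registered
tm_only = tasks.TaskManager(include_defaults=False, include_path="/path/to/my_tasks")
print(sorted(tm_only.all_tasks))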