Commit 88486e57 authored by lintangsutawika

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
from typing import List
import pytest
import torch
from lm_eval import tasks
from lm_eval.api.instance import Instance
......@@ -11,7 +10,7 @@ task_manager = tasks.TaskManager()
@pytest.mark.skip(reason="requires CUDA")
-class TEST_VLLM:
+class Test_VLLM:
    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM
......@@ -19,7 +18,7 @@ class TEST_VLLM:
        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        pass
-    torch.use_deterministic_algorithms(True)
+    # torch.use_deterministic_algorithms(True)
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
......
import os
import re
from typing import List
import pytest
......@@ -6,6 +7,7 @@ import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
from lm_eval.utils import make_table
os.environ["TOKENIZERS_PARALLELISM"] = "false"
......@@ -31,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
            10000,
        ),
    ],
    ids=lambda d: f"{d}",
)
def test_evaluator(
    task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
......@@ -75,3 +78,74 @@ def test_evaluator(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )
@pytest.mark.parametrize(
    "task_name,limit,model,model_args",
    [
        (
            ["ai2_arc"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["lambada_openai"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["wikitext"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
    ],
    ids=lambda d: f"{d}",
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
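    # compare the table printed by make_table() against a stored snapshot under ./tests/testdata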
    results = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
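        # bootstrap_iters=0 skips bootstrap stderr estimation, so the Stderr column prints N/A;
        # all seeds are pinned so the printed values stay reproducible across runs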
        bootstrap_iters=0,
        random_seed=0,
        numpy_random_seed=0,
        torch_random_seed=0,
        fewshot_random_seed=0,
    )
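    # snapshot filename is built from the task list, limit, model, and sanitized model_args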
    filename = "_".join(
        (
            "-".join(task_name),
            str(limit),
            str(model),
            re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
        )
    )
    filepath = f"./tests/testdata/{filename}.txt"
    with open(filepath, "r") as f:
        t1 = f.read().strip()
    t2 = make_table(results).strip()
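    # compare row by row and cell by cell: numeric cells must agree within 0.3, text cells must match exactly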
    t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
    assert len(t1_lines) == len(t2_lines)
    for t1_line, t2_line in zip(t1_lines, t2_lines):
        t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
        assert len(t1_items) == len(t2_items)
        for t1_item, t2_item in zip(t1_items, t2_items):
            try:
                t1_item = float(t1_item)
                t2_item = float(t2_item)
                assert abs(t1_item - t2_item) < 0.3
            except ValueError:
                assert t1_item == t2_item
import os
import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
@pytest.mark.parametrize(
"limit,model,model_args",
[
(
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
),
],
)
def test_include_correctness(limit: int, model: str, model_args: str):
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager()
    task_dict = tasks.get_task_dict(task_name, task_manager)
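    # first run: simple_evaluate() with the default packaged "arc_easy" config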
    e1 = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
    )
    assert e1 is not None
    # run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
    lm = api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": None,
            "max_batch_size": None,
            "device": None,
        },
    )
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )
    task_dict = tasks.get_task_dict(task_name, task_manager)
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
    )
    assert e2 is not None
    # check that caching is working
    def r(x):
        return x["results"]["arc_easy"]
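    # every arc_easy metric from the default-config run should match the ./testconfigs run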
    assert all(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )
# test that setting include_defaults = False works as expected and that include_path works
def test_no_include_defaults():
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )
    # should succeed, because we've included an 'arc_easy' task from this dir
    task_dict = tasks.get_task_dict(task_name, task_manager)
    # should fail, since ./testconfigs has no arc_challenge task
    task_name = ["arc_challenge"]
    with pytest.raises(KeyError):
        task_dict = tasks.get_task_dict(task_name, task_manager)  # noqa: F841
# test that include_path containing a task shadowing another task's name fails
# def test_shadowed_name_fails():
# task_name = ["arc_easy"]
# task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
# task_dict = tasks.get_task_dict(task_name, task_manager)
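# local copy of the arc_easy task config, picked up by the include_path tests above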
task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file