Unverified Commit 3d1b8f43 authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into group-agg-rework

parents e200c24e d855d0ba
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
......
@@ -152,6 +152,55 @@ def general_detokenize(string):
return string
def get_file_task_name(filename: str) -> str:
"""
Given a sample-results filename, extracts and returns the task name.
"""
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
"""
Given a results or sample-results filename, extracts and returns the datetime.
"""
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_model_name(model_name: str) -> str:
"""
Given the model name, returns a sanitized version of it.
"""
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
def sanitize_task_name(task_name: str) -> str:
"""
Given the task name, returns a sanitized version of it.
"""
return re.sub(r"\W", "_", task_name)
def get_latest_filename(filenames: List[str]) -> str:
"""
Given a list of filenames, returns the filename with the latest datetime.
"""
return max(filenames, key=lambda f: get_file_datetime(f))
def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]
def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]
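These helpers assume result files named roughly `results_<datetime>.json` and sample files named `samples_<task>_<datetime>.json` (the task name sits between the first and last underscore). A minimal sketch of how they compose, using hypothetical paths:

```python
from pathlib import Path

from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
    sanitize_model_name,
)

# Hypothetical output directory for one model, following the
# results_<datetime>.json / samples_<task>_<datetime>.json pattern these helpers parse.
files = [
    "out/EleutherAI__pythia-70m/results_2024-05-01T10-00-00.json",
    "out/EleutherAI__pythia-70m/results_2024-05-02T10-00-00.json",
    "out/EleutherAI__pythia-70m/samples_arc_easy_2024-05-02T10-00-00.json",
]

get_results_filenames(files)         # keeps the two results_*.json paths
get_sample_results_filenames(files)  # keeps the samples_arc_easy_*.json path
get_latest_filename([Path(f).name for f in get_results_filenames(files)])
# -> "results_2024-05-02T10-00-00.json"
get_file_task_name("samples_arc_easy_2024-05-02T10-00-00.json")  # -> "arc_easy"
get_file_datetime("samples_arc_easy_2024-05-02T10-00-00.json")   # -> "2024-05-02T10-00-00"
sanitize_model_name("EleutherAI/pythia-70m")                     # -> "EleutherAI__pythia-70m"
```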
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
@@ -300,7 +349,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
if "alias" in dic:
k = dic.pop("alias")
-for (mf), v in dic.items():
+metric_items = dic.items()
+if sort_results:
+    metric_items = sorted(metric_items)
+for (mf), v in metric_items:
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
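For reference, the keys iterated here have the form `<metric>,<filter>`, and the new `sort_results` flag simply orders them before printing. A minimal illustration with made-up values:

```python
dic = {"acc,none": 0.30, "acc_stderr,none": 0.02, "acc_norm,none": 0.10}

metric_items = sorted(dic.items())  # what sort_results=True does
for mf, v in metric_items:
    m, _, f = mf.partition(",")     # "acc,none" -> metric "acc", filter "none"
    if m.endswith("_stderr"):       # stderr values are attached to their metric's row, not printed alone
        continue
    print(f"{m} ({f}): {v}")        # acc (none): 0.3, then acc_norm (none): 0.1
```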
......
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.2"
version = "0.4.3"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
......
@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
4) Training data sets split into more than `too_dirty_cutoff` are considered
-completey contaminated and removed
+completely contaminated and removed
OpenAI used:
```
......
@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
......
@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
......
@@ -70,6 +70,11 @@ def main():
if docs is not None:
iters.append(docs)
if len(iters) == 0:
raise ValueError(
f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
)
docs = join_iters(iters)
with open(
......
@@ -7,7 +7,12 @@ from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
-from lm_eval.utils import eval_logger
+from lm_eval.utils import (
+    eval_logger,
+    get_latest_filename,
+    get_results_filenames,
+    get_sample_results_filenames,
+)
def parse_args():
@@ -45,13 +50,15 @@ def main():
assert len(models) > 0, "No model directories found in the data_path."
# Get the tasks from the latest results file of the first model.
tasks = set(tasks_for_model(models[0], args.data_path))
-for model in models:  # Make sure that all models have the same tasks.
+# Get task names from the latest results file for each model
+# Get intersection of tasks for all models
+for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)
-model_tasks = tasks_for_model(model, args.data_path)
+model_tasks = set(tasks_for_model(model, args.data_path))
tasks.intersection(set(model_tasks))
if task_count != len(tasks):
@@ -66,22 +73,36 @@ def main():
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
-data = json.loads(file.read())
+for line in file:
+    data.append(json.loads(line.strip()))
configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]
@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
Returns:
list: A list of tasks for the model.
"""
-dir_path = Path(data_path, model)
-config = (
-    json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
-)
+# get latest model results for a given name
+model_dir = Path(data_path, model)
+model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+model_results_filenames = get_results_filenames(model_files)
+latest_results = get_latest_filename(model_results_filenames)
+config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
return list(config[0].keys())
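With this change, `tasks_for_model` scans the model's directory for `results_*` files and reads the `configs` of the most recent one instead of a hardcoded `results.json`. A sketch of the intended usage, with a hypothetical output layout:

```python
# Hypothetical layout under data_path:
#   output/EleutherAI__pythia-70m/results_2024-05-01T10-00-00.json
#   output/EleutherAI__pythia-70m/results_2024-05-02T10-00-00.json   <- newest, picked
#   output/EleutherAI__pythia-70m/samples_arc_easy_2024-05-02T10-00-00.json
tasks = tasks_for_model("EleutherAI__pythia-70m", "output/")
print(tasks)  # e.g. ["arc_easy", "gsm8k", "wikitext"]: the keys of the latest run's "configs"
```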
......
@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
......
from typing import List
import pytest
import torch
from lm_eval import tasks
from lm_eval.api.instance import Instance
@@ -11,7 +10,7 @@ task_manager = tasks.TaskManager()
@pytest.mark.skip(reason="requires CUDA")
-class TEST_VLLM:
+class Test_VLLM:
vllm = pytest.importorskip("vllm")
try:
from lm_eval.models.vllm_causallms import VLLM
@@ -19,7 +18,7 @@ class TEST_VLLM:
LM = VLLM(pretrained="EleutherAI/pythia-70m")
except ModuleNotFoundError:
pass
-torch.use_deterministic_algorithms(True)
+# torch.use_deterministic_algorithms(True)
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
......
import os
import re
from typing import List
import pytest
@@ -6,6 +7,7 @@ import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
from lm_eval.utils import make_table
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -31,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
10000,
),
],
ids=lambda d: f"{d}",
)
def test_evaluator(
task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
@@ -75,3 +78,74 @@ def test_evaluator(
x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
)
@pytest.mark.parametrize(
"task_name,limit,model,model_args",
[
(
["ai2_arc"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
(
["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
(
["lambada_openai"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
(
["wikitext"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
],
ids=lambda d: f"{d}",
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
results = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
model_args=model_args,
bootstrap_iters=0,
random_seed=0,
numpy_random_seed=0,
torch_random_seed=0,
fewshot_random_seed=0,
)
filename = "_".join(
(
"-".join(task_name),
str(limit),
str(model),
re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
)
)
filepath = f"./tests/testdata/{filename}.txt"
with open(filepath, "r") as f:
t1 = f.read().strip()
t2 = make_table(results).strip()
t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
assert len(t1_lines) == len(t2_lines)
for t1_line, t2_line in zip(t1_lines, t2_lines):
t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
assert len(t1_items) == len(t2_items)
for t1_item, t2_item in zip(t1_items, t2_items):
try:
t1_item = float(t1_item)
t2_item = float(t2_item)
assert abs(t1_item - t2_item) < 0.3
except ValueError:
assert t1_item == t2_item
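The cell-by-cell comparison above leans on `float()` raising `ValueError` for non-numeric cells: numeric cells get a loose 0.3 tolerance, everything else must match exactly. A small illustration (the helper below is just a restatement for clarity, not part of the test):

```python
def compare_cell(a: str, b: str) -> bool:
    # Numeric cells (surrounding whitespace is fine) compare within a 0.3 tolerance;
    # non-numeric cells (task names, "↑"/"↓", blanks) must match exactly.
    try:
        return abs(float(a) - float(b)) < 0.3
    except ValueError:
        return a == b

compare_cell(" 0.3394 ", " 0.3305")  # True: within tolerance
compare_cell("acc ", "acc ")         # True: exact text match
```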
import os
import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
@pytest.mark.parametrize(
"limit,model,model_args",
[
(
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
),
],
)
def test_include_correctness(limit: int, model: str, model_args: str):
task_name = ["arc_easy"]
task_manager = tasks.TaskManager()
task_dict = tasks.get_task_dict(task_name, task_manager)
e1 = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
model_args=model_args,
)
assert e1 is not None
# run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
lm = api.registry.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": None,
"max_batch_size": None,
"device": None,
},
)
task_name = ["arc_easy"]
task_manager = tasks.TaskManager(
include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
include_defaults=False,
)
task_dict = tasks.get_task_dict(task_name, task_manager)
e2 = evaluator.evaluate(
lm=lm,
task_dict=task_dict,
limit=limit,
)
assert e2 is not None
# check that caching is working
def r(x):
return x["results"]["arc_easy"]
assert all(
x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
)
# test that setting include_defaults = False works as expected and that include_path works
def test_no_include_defaults():
task_name = ["arc_easy"]
task_manager = tasks.TaskManager(
include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
include_defaults=False,
)
# should succeed, because we've included an 'arc_easy' task from this dir
task_dict = tasks.get_task_dict(task_name, task_manager)
# should fail, since ./testconfigs has no arc_challenge task
task_name = ["arc_challenge"]
with pytest.raises(KeyError):
task_dict = tasks.get_task_dict(task_name, task_manager) # noqa: F841
# test that include_path containing a task shadowing another task's name fails
# def test_shadowed_name_fails():
# task_name = ["arc_easy"]
# task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
# task_dict = tasks.get_task_dict(task_name, task_manager)
task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
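For context, `doc_to_text`, `doc_to_target`, and `doc_to_choice` above are Jinja-style templates rendered against each dataset record; with a hypothetical ARC-Easy record they resolve roughly as follows:

```python
# Hypothetical ARC-Easy record (fields follow the allenai/ai2_arc schema).
doc = {
    "question": "Which gas do green plants absorb from the air?",
    "choices": {
        "label": ["A", "B", "C", "D"],
        "text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
    },
    "answerKey": "B",
}

# doc_to_text   -> "Question: Which gas do green plants absorb from the air?\nAnswer:"
# doc_to_target -> choices.label.index(answerKey) == 1  (index of the gold choice)
# doc_to_choice -> ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"]
```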
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file