Commit 88486e57 authored by lintangsutawika


Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
group: xcopa
task: xcopa_et
dataset_path: xcopa
dataset_name: et
......
group: xnli
task:
- xnli_ar
- xnli_bg
- xnli_de
- xnli_el
- xnli_en
- xnli_es
- xnli_fr
- xnli_hi
- xnli_ru
- xnli_sw
- xnli_th
- xnli_tr
- xnli_ur
- xnli_vi
- xnli_zh
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
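The `aggregate_metric_list` block added to each group config asks the harness to report a group-level `acc` aggregated over the listed subtasks; with `weight_by_size: true`, each subtask's score is weighted by its number of samples rather than averaged uniformly. A minimal sketch of that kind of weighted mean follows (illustrative only, not the harness implementation; the subtask scores and sample counts are invented):

```python
# Illustrative sketch of a size-weighted mean over per-subtask accuracies.
# The subtask scores and sample counts below are invented for the example.
subtask_results = {
    "xnli_ar": {"acc": 0.41, "samples": 5010},
    "xnli_en": {"acc": 0.55, "samples": 5010},
    "xnli_sw": {"acc": 0.38, "samples": 2490},
}

def aggregate_acc(results: dict, weight_by_size: bool = True) -> float:
    """Mean of subtask 'acc' values, optionally weighted by sample count."""
    if weight_by_size:
        total = sum(r["samples"] for r in results.values())
        return sum(r["acc"] * r["samples"] for r in results.values()) / total
    return sum(r["acc"] for r in results.values()) / len(results)

print(round(aggregate_acc(subtask_results), 4))  # weighted mean across subtasks
```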
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: xnli
task: null
dataset_path: xnli
dataset_name: null
......
......@@ -24,9 +24,9 @@ Homepage: https://github.com/hitz-zentroa/xnli-eu
}
```
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
#### Tags
* `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset.
......
group: xnli
task: null
dataset_path: xnli
dataset_name: null
......
include: xnli_eu.yaml
group: xnli_eu_mt_native
tag: xnli_eu_mt_native
task: xnli_eu_mt
dataset_name: eu_mt
include: xnli_eu.yaml
group: xnli_eu_mt_native
tag: xnli_eu_mt_native
task: xnli_eu_native
training_split: null
validation_split: null
......
group: xstorycloze
task:
- xstorycloze_ar
- xstorycloze_en
- xstorycloze_es
- xstorycloze_eu
- xstorycloze_hi
- xstorycloze_id
- xstorycloze_my
- xstorycloze_ru
- xstorycloze_sw
- xstorycloze_te
- xstorycloze_zh
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
group: xstorycloze
task: xstorycloze_ar
dataset_path: juletxara/xstory_cloze
dataset_name: ar
......
group: xwinograd
task:
- xwinograd_en
- xwinograd_fr
- xwinograd_jp
- xwinograd_pt
- xwinograd_ru
- xwinograd_zh
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
- xwinograd
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
......
......@@ -152,6 +152,55 @@ def general_detokenize(string):
return string

def get_file_task_name(filename: str) -> str:
    """
    Given the sample results filenames, extracts and returns the task name.
    """
    return filename[filename.find("_") + 1 : filename.rfind("_")]


def get_file_datetime(filename: str) -> str:
    """
    Given the results and sample results filenames, extracts and returns the datetime.
    """
    return filename[filename.rfind("_") + 1 :].replace(".json", "")


def sanitize_model_name(model_name: str) -> str:
    """
    Given the model name, returns a sanitized version of it.
    """
    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)


def sanitize_task_name(task_name: str) -> str:
    """
    Given the task name, returns a sanitized version of it.
    """
    return re.sub(r"\W", "_", task_name)


def get_latest_filename(filenames: List[str]) -> str:
    """
    Given a list of filenames, returns the filename with the latest datetime.
    """
    return max(filenames, key=lambda f: get_file_datetime(f))


def get_results_filenames(filenames: List[str]) -> List[str]:
    """
    Extracts filenames that correspond to aggregated results.
    """
    return [f for f in filenames if "/results_" in f and ".json" in f]


def get_sample_results_filenames(filenames: List[str]) -> List[str]:
    """
    Extracts filenames that correspond to sample results.
    """
    return [f for f in filenames if "/samples_" in f and ".json" in f]

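For reference, a small usage sketch of the new filename helpers; the filename patterns below are an assumption inferred from how the functions slice on underscores (`results_<datetime>.json` for aggregated results, `samples_<task>_<datetime>.jsonl` for per-sample results):

```python
# Assumed filename patterns, shown only to illustrate the helpers above.
from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
)

results_files = [
    "results_2024-05-01T10-00-00.json",
    "results_2024-05-02T09-30-00.json",
]
print(get_latest_filename(results_files))   # results_2024-05-02T09-30-00.json
print(get_file_datetime(results_files[0]))  # 2024-05-01T10-00-00
print(get_file_task_name("samples_xnli_en_2024-05-01T10-00-00.jsonl"))  # xnli_en
```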
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
......@@ -289,7 +338,9 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
keys = result_dict[column].keys()
if sort_results:
# sort entries alphabetically
# sort entries alphabetically by task or group name.
# NOTE: sorting defaults to False because row order matters for multi-level
# table printing a la MMLU; sorting here would break that grouping.
keys = sorted(keys)
for k in keys:
dic = result_dict[column][k]
......@@ -300,20 +351,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
if "alias" in dic:
k = dic.pop("alias")
for (mf), v in dic.items():
metric_items = dic.items()
if sort_results:
metric_items = sorted(metric_items)
for (mf), v in metric_items:
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
if v != " ":
v = "%.4f" % v
hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
if se != "N/A":
se = "%.4f" % se
values.append([k, version, f, n, m, hib, v, "±", se])
se = " N/A" if se == "N/A" else "%.4f" % se
values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
else:
values.append([k, version, f, n, m, hib, v, "", ""])
k = ""
......
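For context on the change above: per-task result dicts key metrics as `"<metric>,<filter>"` (e.g. `"acc,none"`), with matching `"<metric>_stderr,<filter>"` entries, and the updated code formats both the value and its stderr to four decimals, printing ` N/A` when no stderr is available. A minimal sketch of that key handling (the values are invented):

```python
# Sketch of how "acc,none" pairs with "acc_stderr,none" in a result dict.
# The numbers are invented; only the key handling mirrors the table code.
dic = {"acc,none": 0.5132, "acc_stderr,none": 0.0123, "f1,none": 0.4711}

for mf, v in sorted(dic.items()):
    m, _, f = mf.partition(",")
    if m.endswith("_stderr"):
        continue  # stderr entries are emitted next to their parent metric
    se = dic.get(f"{m}_stderr,{f}", "N/A")
    se = " N/A" if se == "N/A" else "%.4f" % se
    print(m, f, "%.4f" % v, "±", se)
```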
......@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.2"
version = "0.4.3"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
......@@ -39,7 +39,6 @@ dependencies = [
"dill",
"word2number",
"more_itertools",
"shortuuid",
]
[tool.setuptools.packages.find]
......
......@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
4) Training data sets split into more than `too_dirty_cutoff` are considered
completey contaminated and removed
completely contaminated and removed
OpenAI used:
```
......
......@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
......
......@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
......
......@@ -70,6 +70,11 @@ def main():
if docs is not None:
iters.append(docs)
if len(iters) == 0:
raise ValueError(
f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
)
docs = join_iters(iters)
with open(
......
......@@ -7,7 +7,12 @@ from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger
from lm_eval.utils import (
eval_logger,
get_latest_filename,
get_results_filenames,
get_sample_results_filenames,
)
def parse_args():
......@@ -45,13 +50,15 @@ def main():
assert len(models) > 0, "No model directories found in the data_path."
# Get the tasks from the latest results file of the first model.
tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks.
# Get task names from the latest results file for each model
# Get intersection of tasks for all models
for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)
model_tasks = tasks_for_model(model, args.data_path)
model_tasks = set(tasks_for_model(model, args.data_path))
tasks.intersection_update(model_tasks)
if task_count != len(tasks):
......@@ -66,22 +73,36 @@ def main():
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
data = json.loads(file.read())
for line in file:
data.append(json.loads(line.strip()))
configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]
......@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
Returns:
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
config = (
json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
)
# get latest model results for a given name
model_dir = Path(data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
return list(config[0].keys())
......
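Taken together, the Zeno changes above make the script locate each model's newest `results_*.json` and `samples_*` files by their embedded datetime and read per-sample records line by line as JSON Lines. A hedged sketch of that flow (the directory layout, model name, and task filter are assumptions for illustration):

```python
import json
from pathlib import Path

from lm_eval.utils import (
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
)

# Hypothetical layout: <data_path>/<model>/results_<dt>.json and
# <data_path>/<model>/samples_<task>_<dt>.jsonl
model_dir = Path("output", "my-model")  # placeholder path
files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]

latest_results = get_latest_filename(
    [Path(f).name for f in get_results_filenames(files)]
)
latest_samples = get_latest_filename(
    [Path(f).name for f in get_sample_results_filenames(files) if "xnli_en" in f]
)

configs = json.load(open(model_dir / latest_results, encoding="utf-8"))["configs"]

with open(model_dir / latest_samples, encoding="utf-8") as fh:
    samples = [json.loads(line) for line in fh]  # one JSON record per line
```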
......@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here
```
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
* `group_name`: `Short description`
#### Tags
* `tag_name`: `Short description`
#### Tasks
* `task_name`: `1-sentence description of what this particular task does`
......
......@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
......