Commit 88486e57 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
group: xcopa
task: xcopa_et task: xcopa_et
dataset_path: xcopa dataset_path: xcopa
dataset_name: et dataset_name: et
......
group: xnli
task:
- xnli_ar
- xnli_bg
- xnli_de
- xnli_el
- xnli_en
- xnli_es
- xnli_fr
- xnli_hi
- xnli_ru
- xnli_sw
- xnli_th
- xnli_tr
- xnli_ur
- xnli_vi
- xnli_zh
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
# This file will be included in the generated language-specific task configs. # This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly # It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness. # by the harness.
group: xnli
task: null task: null
dataset_path: xnli dataset_path: xnli
dataset_name: null dataset_name: null
......
...@@ -24,9 +24,9 @@ Homepage: https://github.com/hitz-zentroa/xnli-eu ...@@ -24,9 +24,9 @@ Homepage: https://github.com/hitz-zentroa/xnli-eu
} }
``` ```
### Groups and Tasks ### Groups, Tags, and Tasks
#### Groups #### Tags
* `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset. * `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset.
......
group: xnli
task: null task: null
dataset_path: xnli dataset_path: xnli
dataset_name: null dataset_name: null
......
include: xnli_eu.yaml include: xnli_eu.yaml
group: xnli_eu_mt_native tag: xnli_eu_mt_native
task: xnli_eu_mt task: xnli_eu_mt
dataset_name: eu_mt dataset_name: eu_mt
include: xnli_eu.yaml include: xnli_eu.yaml
group: xnli_eu_mt_native tag: xnli_eu_mt_native
task: xnli_eu_native task: xnli_eu_native
training_split: null training_split: null
validation_split: null validation_split: null
......
group: xstorycloze
task:
- xstorycloze_ar
- xstorycloze_en
- xstorycloze_es
- xstorycloze_eu
- xstorycloze_hi
- xstorycloze_id
- xstorycloze_my
- xstorycloze_ru
- xstorycloze_sw
- xstorycloze_te
- xstorycloze_zh
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
group: xstorycloze
task: xstorycloze_ar task: xstorycloze_ar
dataset_path: juletxara/xstory_cloze dataset_path: juletxara/xstory_cloze
dataset_name: ar dataset_name: ar
......
group: xwinograd
task:
- xwinograd_en
- xwinograd_fr
- xwinograd_jp
- xwinograd_pt
- xwinograd_ru
- xwinograd_zh
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
# This file will be included in the generated language-specific task configs. # This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly # It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness. # by the harness.
group:
- xwinograd
dataset_path: Muennighoff/xwinograd dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config. dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice output_type: multiple_choice
......
...@@ -152,6 +152,55 @@ def general_detokenize(string): ...@@ -152,6 +152,55 @@ def general_detokenize(string):
return string return string
def get_file_task_name(filename: str) -> str:
"""
Given the sample results filenames, extracts and returns the task name.
"""
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
"""
Given the results and sample results filenames, extracts and returns the datetime.
"""
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_model_name(model_name: str) -> str:
"""
Given the model name, returns a sanitized version of it.
"""
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
def sanitize_task_name(task_name: str) -> str:
"""
Given the task name, returns a sanitized version of it.
"""
return re.sub(r"\W", "_", task_name)
def get_latest_filename(filenames: List[str]) -> str:
"""
Given a list of filenames, returns the filename with the latest datetime.
"""
return max(filenames, key=lambda f: get_file_datetime(f))
def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]
def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len): def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
""" """
- context_len allows for a rolling window context, allowing each prediction window to potentially - context_len allows for a rolling window context, allowing each prediction window to potentially
...@@ -289,7 +338,9 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False) ...@@ -289,7 +338,9 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
keys = result_dict[column].keys() keys = result_dict[column].keys()
if sort_results: if sort_results:
# sort entries alphabetically # sort entries alphabetically by task or group name.
# NOTE: we default here to false, because order matters for multi-level table printing a la mmlu.
# sorting here would mess that up
keys = sorted(keys) keys = sorted(keys)
for k in keys: for k in keys:
dic = result_dict[column][k] dic = result_dict[column][k]
...@@ -300,20 +351,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False) ...@@ -300,20 +351,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
if "alias" in dic: if "alias" in dic:
k = dic.pop("alias") k = dic.pop("alias")
for (mf), v in dic.items(): metric_items = dic.items()
if sort_results:
metric_items = sorted(metric_items)
for (mf), v in metric_items:
m, _, f = mf.partition(",") m, _, f = mf.partition(",")
if m.endswith("_stderr"): if m.endswith("_stderr"):
continue continue
if v != " ":
v = "%.4f" % v
hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "") hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
if m + "_stderr" + "," + f in dic: if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f] se = dic[m + "_stderr" + "," + f]
if se != "N/A": se = " N/A" if se == "N/A" else "%.4f" % se
se = "%.4f" % se values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
values.append([k, version, f, n, m, hib, v, "±", se])
else: else:
values.append([k, version, f, n, m, hib, v, "", ""]) values.append([k, version, f, n, m, hib, v, "", ""])
k = "" k = ""
......
...@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" ...@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "lm_eval" name = "lm_eval"
version = "0.4.2" version = "0.4.3"
authors = [ authors = [
{name="EleutherAI", email="contact@eleuther.ai"} {name="EleutherAI", email="contact@eleuther.ai"}
] ]
...@@ -39,7 +39,6 @@ dependencies = [ ...@@ -39,7 +39,6 @@ dependencies = [
"dill", "dill",
"word2number", "word2number",
"more_itertools", "more_itertools",
"shortuuid",
] ]
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
......
...@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1 ...@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
the match, splitting the training data into chunks the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed 3) Any chunks less than `minimum_slice_length` are removed
4) Training data sets split into more than `too_dirty_cutoff` are considered 4) Training data sets split into more than `too_dirty_cutoff` are considered
completey contaminated and removed completely contaminated and removed
OpenAI used: OpenAI used:
``` ```
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
Usage: Usage:
python make_table_tasks.py --output <markdown_filename> python make_table_tasks.py --output <markdown_filename>
""" """
import json import json
import logging import logging
import os import os
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
Usage: Usage:
python make_table_tasks.py --output <markdown_filename> python make_table_tasks.py --output <markdown_filename>
""" """
import argparse import argparse
import logging import logging
......
...@@ -70,6 +70,11 @@ def main(): ...@@ -70,6 +70,11 @@ def main():
if docs is not None: if docs is not None:
iters.append(docs) iters.append(docs)
if len(iters) == 0:
raise ValueError(
f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
)
docs = join_iters(iters) docs = join_iters(iters)
with open( with open(
......
...@@ -7,7 +7,12 @@ from pathlib import Path ...@@ -7,7 +7,12 @@ from pathlib import Path
import pandas as pd import pandas as pd
from zeno_client import ZenoClient, ZenoMetric from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger from lm_eval.utils import (
eval_logger,
get_latest_filename,
get_results_filenames,
get_sample_results_filenames,
)
def parse_args(): def parse_args():
...@@ -45,13 +50,15 @@ def main(): ...@@ -45,13 +50,15 @@ def main():
assert len(models) > 0, "No model directories found in the data_path." assert len(models) > 0, "No model directories found in the data_path."
# Get the tasks from the latest results file of the first model.
tasks = set(tasks_for_model(models[0], args.data_path)) tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks. # Get tasks names from the latest results file for each model
# Get intersection of tasks for all models
for model in models:
old_tasks = tasks.copy() old_tasks = tasks.copy()
task_count = len(tasks) task_count = len(tasks)
model_tasks = set(tasks_for_model(model, args.data_path))
model_tasks = tasks_for_model(model, args.data_path)
tasks.intersection(set(model_tasks)) tasks.intersection(set(model_tasks))
if task_count != len(tasks): if task_count != len(tasks):
...@@ -66,22 +73,36 @@ def main(): ...@@ -66,22 +73,36 @@ def main():
for task in tasks: for task in tasks:
# Upload data for all models # Upload data for all models
for model_index, model in enumerate(models): for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub( model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", r"[\"<>:/\|\\?\*\[\]]+",
"__", "__",
json.load( json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8") open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"], )["config"]["model_args"],
) )
print(model_args)
data = []
with open( with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"), Path(args.data_path, model, latest_sample_results),
"r", "r",
encoding="utf-8", encoding="utf-8",
) as file: ) as file:
data = json.loads(file.read()) for line in file:
data.append(json.loads(line.strip()))
configs = json.load( configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8") open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"] )["configs"]
config = configs[task] config = configs[task]
...@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str): ...@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
Returns: Returns:
list: A list of tasks for the model. list: A list of tasks for the model.
""" """
dir_path = Path(data_path, model) # get latest model results for a given name
config = ( model_dir = Path(data_path, model)
json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"], model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
) model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
return list(config[0].keys()) return list(config[0].keys())
......
...@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable` ...@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here BibTeX-formatted citation goes here
``` ```
### Groups and Tasks ### Groups, Tags, and Tasks
#### Groups #### Groups
* `group_name`: `Short description` * `group_name`: `Short description`
#### Tags
* `tag_name`: `Short description`
#### Tasks #### Tasks
* `task_name`: `1-sentence description of what this particular task does` * `task_name`: `1-sentence description of what this particular task does`
......
...@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [ ...@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
] ]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS) @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task): def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string( lm = get_model("sparseml").create_from_arg_string(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment